Index: head/sys/compat/linprocfs/linprocfs.c =================================================================== --- head/sys/compat/linprocfs/linprocfs.c (revision 334117) +++ head/sys/compat/linprocfs/linprocfs.c (revision 334118) @@ -1,1663 +1,1663 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2000 Dag-Erling Coïdan Smørgrav * Copyright (c) 1999 Pierre Beyssac * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif /* __i386__ || __amd64__ */ #include #include #include #include #include #include /* * Various conversion macros */ #define T2J(x) ((long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to jiffies */ #define T2CS(x) ((unsigned long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to centiseconds */ #define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */ #define B2K(x) ((x) >> 10) /* bytes to kbytes */ #define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */ #define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */ #define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */ #define TV2J(x) ((x)->tv_sec * 100UL + (x)->tv_usec / 10000) /** * @brief Mapping of ki_stat in struct kinfo_proc to the linux state * * The linux procfs state field displays one of the characters RSDZTW to * denote running, sleeping in an interruptible wait, waiting in an * uninterruptible disk sleep, a zombie process, process is being traced * or stopped, or process is paging respectively. * * Our struct kinfo_proc contains the variable ki_stat which contains a * value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK. * * This character array is used with ki_stati-1 as an index and tries to * map our states to suitable linux states. */ static char linux_state[] = "RRSTZDD"; /* * Filler function for proc/meminfo */ static int linprocfs_domeminfo(PFS_FILL_ARGS) { unsigned long memtotal; /* total memory in bytes */ unsigned long memused; /* used memory in bytes */ unsigned long memfree; /* free memory in bytes */ unsigned long buffers, cached; /* buffer / cache memory ??? */ unsigned long long swaptotal; /* total swap space in bytes */ unsigned long long swapused; /* used swap space in bytes */ unsigned long long swapfree; /* free swap space in bytes */ int i, j; memtotal = physmem * PAGE_SIZE; /* * The correct thing here would be: * memfree = vm_free_count() * PAGE_SIZE; memused = memtotal - memfree; * * but it might mislead linux binaries into thinking there * is very little memory left, so we cheat and tell them that * all memory that isn't wired down is free. */ memused = vm_wire_count() * PAGE_SIZE; memfree = memtotal - memused; swap_pager_status(&i, &j); swaptotal = (unsigned long long)i * PAGE_SIZE; swapused = (unsigned long long)j * PAGE_SIZE; swapfree = swaptotal - swapused; /* * We'd love to be able to write: * buffers = bufspace; * * but bufspace is internal to vfs_bio.c and we don't feel * like unstaticizing it just for linprocfs's sake. */ buffers = 0; cached = vm_inactive_count() * PAGE_SIZE; sbuf_printf(sb, "MemTotal: %9lu kB\n" "MemFree: %9lu kB\n" "Buffers: %9lu kB\n" "Cached: %9lu kB\n" "SwapTotal:%9llu kB\n" "SwapFree: %9llu kB\n", B2K(memtotal), B2K(memfree), B2K(buffers), B2K(cached), B2K(swaptotal), B2K(swapfree)); return (0); } #if defined(__i386__) || defined(__amd64__) /* * Filler function for proc/cpuinfo (i386 & amd64 version) */ static int linprocfs_docpuinfo(PFS_FILL_ARGS) { int hw_model[2]; char model[128]; uint64_t freq; size_t size; u_int cache_size[4]; int fqmhz, fqkhz; int i, j; /* * We default the flags to include all non-conflicting flags, * and the Intel versions of conflicting flags. */ static char *flags[] = { "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", "sep", "sep", "mtrr", "pge", "mca", "cmov", "pat", "pse36", "pn", "b19", "b20", "b21", "mmxext", "mmx", "fxsr", "xmm", "sse2", "b27", "b28", "b29", "3dnowext", "3dnow" }; static char *power_flags[] = { "ts", "fid", "vid", "ttp", "tm", "stc", "100mhzsteps", "hwpstate", "", "cpb", "eff_freq_ro", "proc_feedback", "acc_power", }; hw_model[0] = CTL_HW; hw_model[1] = HW_MODEL; model[0] = '\0'; size = sizeof(model); if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0) strcpy(model, "unknown"); #ifdef __i386__ switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if (cpu_class < CPUCLASS_686) flags[16] = "fcmov"; break; case CPU_VENDOR_CYRIX: flags[24] = "cxmmx"; break; } #endif if (cpu_exthigh >= 0x80000006) do_cpuid(0x80000006, cache_size); else memset(cache_size, 0, sizeof(cache_size)); for (i = 0; i < mp_ncpus; ++i) { fqmhz = 0; fqkhz = 0; freq = atomic_load_acq_64(&tsc_freq); if (freq != 0) { fqmhz = (freq + 4999) / 1000000; fqkhz = ((freq + 4999) / 10000) % 100; } sbuf_printf(sb, "processor\t: %d\n" "vendor_id\t: %.20s\n" "cpu family\t: %u\n" "model\t\t: %u\n" "model name\t: %s\n" "stepping\t: %u\n" "cpu MHz\t\t: %d.%02d\n" "cache size\t: %d KB\n" "physical id\t: %d\n" "siblings\t: %d\n" "core id\t\t: %d\n" "cpu cores\t: %d\n" "apicid\t\t: %d\n" "initial apicid\t: %d\n" "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: %s\n", i, cpu_vendor, CPUID_TO_FAMILY(cpu_id), CPUID_TO_MODEL(cpu_id), model, cpu_id & CPUID_STEPPING, fqmhz, fqkhz, (cache_size[2] >> 16), 0, mp_ncpus, i, mp_ncpus, i, i, /*cpu_id & CPUID_LOCAL_APIC_ID ??*/ (cpu_feature & CPUID_FPU) ? "yes" : "no", "yes", CPUID_TO_FAMILY(cpu_id), "yes"); sbuf_cat(sb, "flags\t\t:"); for (j = 0; j < nitems(flags); j++) if (cpu_feature & (1 << j)) sbuf_printf(sb, " %s", flags[j]); sbuf_cat(sb, "\n"); sbuf_printf(sb, "bugs\t\t: %s\n" "bogomips\t: %d.%02d\n" "clflush size\t: %d\n" "cache_alignment\t: %d\n" "address sizes\t: %d bits physical, %d bits virtual\n", #if defined(I586_CPU) && !defined(NO_F00F_HACK) (has_f00f_bug) ? "Intel F00F" : "", #else "", #endif fqmhz, fqkhz, cpu_clflush_line_size, cpu_clflush_line_size, cpu_maxphyaddr, (cpu_maxphyaddr > 32) ? 48 : 0); sbuf_cat(sb, "power management: "); for (j = 0; j < nitems(power_flags); j++) if (amd_pminfo & (1 << j)) sbuf_printf(sb, " %s", power_flags[j]); sbuf_cat(sb, "\n\n"); /* XXX per-cpu vendor / class / model / id? */ } sbuf_cat(sb, "\n"); return (0); } #endif /* __i386__ || __amd64__ */ /* * Filler function for proc/mtab * * This file doesn't exist in Linux' procfs, but is included here so * users can symlink /compat/linux/etc/mtab to /proc/mtab */ static int linprocfs_domtab(PFS_FILL_ARGS) { struct nameidata nd; const char *lep; char *dlep, *flep, *mntto, *mntfrom, *fstype; size_t lep_len; int error; struct statfs *buf, *sp; size_t count; /* resolve symlinks etc. in the emulation tree prefix */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td); flep = NULL; error = namei(&nd); lep = linux_emul_path; if (error == 0) { if (vn_fullpath(td, nd.ni_vp, &dlep, &flep) == 0) lep = dlep; vrele(nd.ni_vp); } lep_len = strlen(lep); buf = NULL; error = kern_getfsstat(td, &buf, SIZE_T_MAX, &count, UIO_SYSSPACE, MNT_WAIT); if (error != 0) { free(buf, M_TEMP); free(flep, M_TEMP); return (error); } for (sp = buf; count > 0; sp++, count--) { /* determine device name */ mntfrom = sp->f_mntfromname; /* determine mount point */ mntto = sp->f_mntonname; if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/') mntto += lep_len; /* determine fs type */ fstype = sp->f_fstypename; if (strcmp(fstype, pn->pn_info->pi_name) == 0) mntfrom = fstype = "proc"; else if (strcmp(fstype, "procfs") == 0) continue; if (strcmp(fstype, "linsysfs") == 0) { sbuf_printf(sb, "/sys %s sysfs %s", mntto, sp->f_flags & MNT_RDONLY ? "ro" : "rw"); } else { /* For Linux msdosfs is called vfat */ if (strcmp(fstype, "msdosfs") == 0) fstype = "vfat"; sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype, sp->f_flags & MNT_RDONLY ? "ro" : "rw"); } #define ADD_OPTION(opt, name) \ if (sp->f_flags & (opt)) sbuf_printf(sb, "," name); ADD_OPTION(MNT_SYNCHRONOUS, "sync"); ADD_OPTION(MNT_NOEXEC, "noexec"); ADD_OPTION(MNT_NOSUID, "nosuid"); ADD_OPTION(MNT_UNION, "union"); ADD_OPTION(MNT_ASYNC, "async"); ADD_OPTION(MNT_SUIDDIR, "suiddir"); ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow"); ADD_OPTION(MNT_NOATIME, "noatime"); #undef ADD_OPTION /* a real Linux mtab will also show NFS options */ sbuf_printf(sb, " 0 0\n"); } free(buf, M_TEMP); free(flep, M_TEMP); return (error); } /* * Filler function for proc/partitions */ static int linprocfs_dopartitions(PFS_FILL_ARGS) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp; int major, minor; g_topology_lock(); sbuf_printf(sb, "major minor #blocks name rio rmerge rsect " "ruse wio wmerge wsect wuse running use aveq\n"); LIST_FOREACH(cp, &g_classes, class) { if (strcmp(cp->name, "DISK") == 0 || strcmp(cp->name, "PART") == 0) LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (linux_driver_get_major_minor( pp->name, &major, &minor) != 0) { major = 0; minor = 0; } sbuf_printf(sb, "%d %d %lld %s " "%d %d %d %d %d " "%d %d %d %d %d %d\n", major, minor, (long long)pp->mediasize, pp->name, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } } } g_topology_unlock(); return (0); } /* * Filler function for proc/stat */ static int linprocfs_dostat(PFS_FILL_ARGS) { struct pcpu *pcpu; long cp_time[CPUSTATES]; long *cp; struct timeval boottime; int i; read_cpu_time(cp_time); getboottime(&boottime); sbuf_printf(sb, "cpu %ld %ld %ld %ld\n", T2J(cp_time[CP_USER]), T2J(cp_time[CP_NICE]), T2J(cp_time[CP_SYS] /*+ cp_time[CP_INTR]*/), T2J(cp_time[CP_IDLE])); CPU_FOREACH(i) { pcpu = pcpu_find(i); cp = pcpu->pc_cp_time; sbuf_printf(sb, "cpu%d %ld %ld %ld %ld\n", i, T2J(cp[CP_USER]), T2J(cp[CP_NICE]), T2J(cp[CP_SYS] /*+ cp[CP_INTR]*/), T2J(cp[CP_IDLE])); } sbuf_printf(sb, "disk 0 0 0 0\n" "page %ju %ju\n" "swap %ju %ju\n" "intr %ju\n" "ctxt %ju\n" "btime %lld\n", (uintmax_t)VM_CNT_FETCH(v_vnodepgsin), (uintmax_t)VM_CNT_FETCH(v_vnodepgsout), (uintmax_t)VM_CNT_FETCH(v_swappgsin), (uintmax_t)VM_CNT_FETCH(v_swappgsout), (uintmax_t)VM_CNT_FETCH(v_intr), (uintmax_t)VM_CNT_FETCH(v_swtch), (long long)boottime.tv_sec); return (0); } static int linprocfs_doswaps(PFS_FILL_ARGS) { struct xswdev xsw; uintmax_t total, used; int n; char devname[SPECNAMELEN + 1]; sbuf_printf(sb, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); for (n = 0; ; n++) { if (swap_dev_info(n, &xsw, devname, sizeof(devname)) != 0) break; total = (uintmax_t)xsw.xsw_nblks * PAGE_SIZE / 1024; used = (uintmax_t)xsw.xsw_used * PAGE_SIZE / 1024; /* * The space and not tab after the device name is on * purpose. Linux does so. */ sbuf_printf(sb, "/dev/%-34s unknown\t\t%jd\t%jd\t-1\n", devname, total, used); } return (0); } /* * Filler function for proc/uptime */ static int linprocfs_douptime(PFS_FILL_ARGS) { long cp_time[CPUSTATES]; struct timeval tv; getmicrouptime(&tv); read_cpu_time(cp_time); sbuf_printf(sb, "%lld.%02ld %ld.%02lu\n", (long long)tv.tv_sec, tv.tv_usec / 10000, T2S(cp_time[CP_IDLE] / mp_ncpus), T2CS(cp_time[CP_IDLE] / mp_ncpus) % 100); return (0); } /* * Get OS build date */ static void linprocfs_osbuild(struct thread *td, struct sbuf *sb) { #if 0 char osbuild[256]; char *cp1, *cp2; strncpy(osbuild, version, 256); osbuild[255] = '\0'; cp1 = strstr(osbuild, "\n"); cp2 = strstr(osbuild, ":"); if (cp1 && cp2) { *cp1 = *cp2 = '\0'; cp1 = strstr(osbuild, "#"); } else cp1 = NULL; if (cp1) sbuf_printf(sb, "%s%s", cp1, cp2 + 1); else #endif sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977"); } /* * Get OS builder */ static void linprocfs_osbuilder(struct thread *td, struct sbuf *sb) { #if 0 char builder[256]; char *cp; cp = strstr(version, "\n "); if (cp) { strncpy(builder, cp + 5, 256); builder[255] = '\0'; cp = strstr(builder, ":"); if (cp) *cp = '\0'; } if (cp) sbuf_cat(sb, builder); else #endif sbuf_cat(sb, "des@freebsd.org"); } /* * Filler function for proc/version */ static int linprocfs_doversion(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s version %s (", osname, osrelease); linprocfs_osbuilder(td, sb); sbuf_cat(sb, ") (gcc version " __VERSION__ ") "); linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/loadavg */ static int linprocfs_doloadavg(PFS_FILL_ARGS) { sbuf_printf(sb, "%d.%02d %d.%02d %d.%02d %d/%d %d\n", (int)(averunnable.ldavg[0] / averunnable.fscale), (int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[1] / averunnable.fscale), (int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[2] / averunnable.fscale), (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), 1, /* number of running tasks */ nprocs, /* number of tasks */ lastpid /* the last pid */ ); return (0); } /* * Filler function for proc/pid/stat */ static int linprocfs_doprocstat(PFS_FILL_ARGS) { struct kinfo_proc kp; struct timeval boottime; char state; static int ratelimit = 0; vm_offset_t startcode, startdata; getboottime(&boottime); sx_slock(&proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); sx_sunlock(&proctree_lock); if (p->p_vmspace) { startcode = (vm_offset_t)p->p_vmspace->vm_taddr; startdata = (vm_offset_t)p->p_vmspace->vm_daddr; } else { startcode = 0; startdata = 0; } sbuf_printf(sb, "%d", p->p_pid); #define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg) PS_ADD("comm", "(%s)", p->p_comm); if (kp.ki_stat > sizeof(linux_state)) { state = 'R'; if (ratelimit == 0) { printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n", kp.ki_stat, sizeof(linux_state)); ++ratelimit; } } else state = linux_state[kp.ki_stat - 1]; PS_ADD("state", "%c", state); PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0); PS_ADD("pgrp", "%d", p->p_pgid); PS_ADD("session", "%d", p->p_session->s_sid); PROC_UNLOCK(p); PS_ADD("tty", "%ju", (uintmax_t)kp.ki_tdev); PS_ADD("tpgid", "%d", kp.ki_tpgid); PS_ADD("flags", "%u", 0); /* XXX */ PS_ADD("minflt", "%lu", kp.ki_rusage.ru_minflt); PS_ADD("cminflt", "%lu", kp.ki_rusage_ch.ru_minflt); PS_ADD("majflt", "%lu", kp.ki_rusage.ru_majflt); PS_ADD("cmajflt", "%lu", kp.ki_rusage_ch.ru_majflt); PS_ADD("utime", "%ld", TV2J(&kp.ki_rusage.ru_utime)); PS_ADD("stime", "%ld", TV2J(&kp.ki_rusage.ru_stime)); PS_ADD("cutime", "%ld", TV2J(&kp.ki_rusage_ch.ru_utime)); PS_ADD("cstime", "%ld", TV2J(&kp.ki_rusage_ch.ru_stime)); PS_ADD("priority", "%d", kp.ki_pri.pri_user); PS_ADD("nice", "%d", kp.ki_nice); /* 19 (nicest) to -19 */ PS_ADD("0", "%d", 0); /* removed field */ PS_ADD("itrealvalue", "%d", 0); /* XXX */ PS_ADD("starttime", "%lu", TV2J(&kp.ki_start) - TV2J(&boottime)); PS_ADD("vsize", "%ju", P2K((uintmax_t)kp.ki_size)); PS_ADD("rss", "%ju", (uintmax_t)kp.ki_rssize); PS_ADD("rlim", "%lu", kp.ki_rusage.ru_maxrss); PS_ADD("startcode", "%ju", (uintmax_t)startcode); PS_ADD("endcode", "%ju", (uintmax_t)startdata); PS_ADD("startstack", "%u", 0); /* XXX */ PS_ADD("kstkesp", "%u", 0); /* XXX */ PS_ADD("kstkeip", "%u", 0); /* XXX */ PS_ADD("signal", "%u", 0); /* XXX */ PS_ADD("blocked", "%u", 0); /* XXX */ PS_ADD("sigignore", "%u", 0); /* XXX */ PS_ADD("sigcatch", "%u", 0); /* XXX */ PS_ADD("wchan", "%u", 0); /* XXX */ PS_ADD("nswap", "%lu", kp.ki_rusage.ru_nswap); PS_ADD("cnswap", "%lu", kp.ki_rusage_ch.ru_nswap); PS_ADD("exitsignal", "%d", 0); /* XXX */ PS_ADD("processor", "%u", kp.ki_lastcpu); PS_ADD("rt_priority", "%u", 0); /* XXX */ /* >= 2.5.19 */ PS_ADD("policy", "%u", kp.ki_pri.pri_class); /* >= 2.5.19 */ #undef PS_ADD sbuf_putc(sb, '\n'); return (0); } /* * Filler function for proc/pid/statm */ static int linprocfs_doprocstatm(PFS_FILL_ARGS) { struct kinfo_proc kp; segsz_t lsize; sx_slock(&proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* * See comments in linprocfs_doprocstatus() regarding the * computation of lsize. */ /* size resident share trs drs lrs dt */ sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size)); sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize); sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */ sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_tsize); sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "%ju ", (uintmax_t)lsize); sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */ return (0); } /* * Filler function for proc/pid/status */ static int linprocfs_doprocstatus(PFS_FILL_ARGS) { struct kinfo_proc kp; char *state; segsz_t lsize; struct thread *td2; struct sigacts *ps; l_sigset_t siglist, sigignore, sigcatch; int i; sx_slock(&proctree_lock); PROC_LOCK(p); td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { switch(p->p_state) { case PRS_NEW: state = "I (idle)"; break; case PRS_NORMAL: if (p->p_flag & P_WEXIT) { state = "X (exiting)"; break; } switch(td2->td_state) { case TDS_INHIBITED: state = "S (sleeping)"; break; case TDS_RUNQ: case TDS_RUNNING: state = "R (running)"; break; default: state = "? (unknown)"; break; } break; case PRS_ZOMBIE: state = "Z (zombie)"; break; default: state = "? (unknown)"; break; } } fill_kinfo_proc(p, &kp); sx_sunlock(&proctree_lock); sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */ sbuf_printf(sb, "State:\t%s\n", state); /* * Credentials */ sbuf_printf(sb, "Pid:\t%d\n", p->p_pid); sbuf_printf(sb, "PPid:\t%d\n", p->p_pptr ? p->p_pptr->p_pid : 0); sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid, p->p_ucred->cr_uid, p->p_ucred->cr_svuid, /* FreeBSD doesn't have fsuid */ p->p_ucred->cr_uid); sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid, p->p_ucred->cr_gid, p->p_ucred->cr_svgid, /* FreeBSD doesn't have fsgid */ p->p_ucred->cr_gid); sbuf_cat(sb, "Groups:\t"); for (i = 0; i < p->p_ucred->cr_ngroups; i++) sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]); PROC_UNLOCK(p); sbuf_putc(sb, '\n'); /* * Memory * * While our approximation of VmLib may not be accurate (I * don't know of a simple way to verify it, and I'm not sure * it has much meaning anyway), I believe it's good enough. * * The same code that could (I think) accurately compute VmLib * could also compute VmLck, but I don't really care enough to * implement it. Submissions are welcome. */ sbuf_printf(sb, "VmSize:\t%8ju kB\n", B2K((uintmax_t)kp.ki_size)); sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */ sbuf_printf(sb, "VmRSS:\t%8ju kB\n", P2K((uintmax_t)kp.ki_rssize)); sbuf_printf(sb, "VmData:\t%8ju kB\n", P2K((uintmax_t)kp.ki_dsize)); sbuf_printf(sb, "VmStk:\t%8ju kB\n", P2K((uintmax_t)kp.ki_ssize)); sbuf_printf(sb, "VmExe:\t%8ju kB\n", P2K((uintmax_t)kp.ki_tsize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "VmLib:\t%8ju kB\n", P2K((uintmax_t)lsize)); /* * Signal masks */ PROC_LOCK(p); bsd_to_linux_sigset(&p->p_siglist, &siglist); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); bsd_to_linux_sigset(&ps->ps_sigignore, &sigignore); bsd_to_linux_sigset(&ps->ps_sigcatch, &sigcatch); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); sbuf_printf(sb, "SigPnd:\t%016jx\n", siglist.__mask); /* * XXX. SigBlk - target thread's signal mask, td_sigmask. * To implement SigBlk pseudofs should support proc/tid dir entries. */ sbuf_printf(sb, "SigBlk:\t%016x\n", 0); sbuf_printf(sb, "SigIgn:\t%016jx\n", sigignore.__mask); sbuf_printf(sb, "SigCgt:\t%016jx\n", sigcatch.__mask); /* * Linux also prints the capability masks, but we don't have * capabilities yet, and when we do get them they're likely to * be meaningless to Linux programs, so we lie. XXX */ sbuf_printf(sb, "CapInh:\t%016x\n", 0); sbuf_printf(sb, "CapPrm:\t%016x\n", 0); sbuf_printf(sb, "CapEff:\t%016x\n", 0); return (0); } /* * Filler function for proc/pid/cwd */ static int linprocfs_doproccwd(PFS_FILL_ARGS) { struct filedesc *fdp; struct vnode *vp; char *fullpath = "unknown"; char *freepath = NULL; fdp = p->p_fd; FILEDESC_SLOCK(fdp); vp = fdp->fd_cdir; if (vp != NULL) VREF(vp); FILEDESC_SUNLOCK(fdp); vn_fullpath(td, vp, &fullpath, &freepath); if (vp != NULL) vrele(vp); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/root */ static int linprocfs_doprocroot(PFS_FILL_ARGS) { struct filedesc *fdp; struct vnode *vp; char *fullpath = "unknown"; char *freepath = NULL; fdp = p->p_fd; FILEDESC_SLOCK(fdp); vp = jailed(p->p_ucred) ? fdp->fd_jdir : fdp->fd_rdir; if (vp != NULL) VREF(vp); FILEDESC_SUNLOCK(fdp); vn_fullpath(td, vp, &fullpath, &freepath); if (vp != NULL) vrele(vp); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/cmdline */ static int linprocfs_doproccmdline(PFS_FILL_ARGS) { int ret; PROC_LOCK(p); if ((ret = p_cansee(td, p)) != 0) { PROC_UNLOCK(p); return (ret); } /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwize. */ if (p->p_vmspace == &vmspace0) { PROC_UNLOCK(p); return (0); } if (p->p_args != NULL) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); return (0); } if ((p->p_flag & P_SYSTEM) != 0) { PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); ret = proc_getargv(td, p, sb); return (ret); } /* * Filler function for proc/pid/environ */ static int linprocfs_doprocenviron(PFS_FILL_ARGS) { /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwize. */ if (p->p_vmspace == &vmspace0) return (0); return (proc_getenvv(td, p, sb)); } static char l32_map_str[] = "%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n"; static char l64_map_str[] = "%016lx-%016lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n"; static char vdso_str[] = " [vdso]"; static char stack_str[] = " [stack]"; /* * Filler function for proc/pid/maps */ static int linprocfs_doprocmaps(PFS_FILL_ARGS) { struct vmspace *vm; vm_map_t map; vm_map_entry_t entry, tmp_entry; vm_object_t obj, tobj, lobj; vm_offset_t e_start, e_end; vm_ooffset_t off = 0; vm_prot_t e_prot; unsigned int last_timestamp; char *name = "", *freename = NULL; const char *l_map_str; ino_t ino; int ref_count, shadow_count, flags; int error; struct vnode *vp; struct vattr vat; PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); error = 0; vm = vmspace_acquire_ref(p); if (vm == NULL) return (ESRCH); if (SV_CURPROC_FLAG(SV_LP64)) l_map_str = l64_map_str; else l_map_str = l32_map_str; map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { name = ""; freename = NULL; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; e_prot = entry->protection; e_start = entry->start; e_end = entry->end; obj = entry->object.vm_object; for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } last_timestamp = map->timestamp; vm_map_unlock_read(map); ino = 0; if (lobj) { off = IDX_TO_OFF(lobj->size); vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(td, vp, &name, &freename); vn_lock(vp, LK_SHARED | LK_RETRY); VOP_GETATTR(vp, &vat, td->td_ucred); ino = vat.va_fileid; vput(vp); } else if (SV_PROC_ABI(p) == SV_ABI_LINUX) { if (e_start == p->p_sysent->sv_shared_page_base) name = vdso_str; if (e_end == p->p_sysent->sv_usrstack) name = stack_str; } } else { flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, access, offset, major, minor, inode, name. */ error = sbuf_printf(sb, l_map_str, (u_long)e_start, (u_long)e_end, (e_prot & VM_PROT_READ)?"r":"-", (e_prot & VM_PROT_WRITE)?"w":"-", (e_prot & VM_PROT_EXECUTE)?"x":"-", "p", (u_long)off, 0, 0, (u_long)ino, *name ? " " : "", name ); if (freename) free(freename, M_TEMP); vm_map_lock_read(map); if (error == -1) { error = 0; break; } if (last_timestamp != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, e_end - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); return (error); } /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) static int linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen) { struct ifnet *ifscan; int ethno; IFNET_RLOCK_ASSERT(); /* Short-circuit non ethernet interfaces */ if (!IFP_IS_ETH(ifp)) return (strlcpy(buffer, ifp->if_xname, buflen)); /* Determine the (relative) unit number for ethernet interfaces */ ethno = 0; - TAILQ_FOREACH(ifscan, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifscan, &V_ifnet, if_link) { if (ifscan == ifp) return (snprintf(buffer, buflen, "eth%d", ethno)); if (IFP_IS_ETH(ifscan)) ethno++; } return (0); } /* * Filler function for proc/net/dev */ static int linprocfs_donetdev(PFS_FILL_ARGS) { char ifname[16]; /* XXX LINUX_IFNAMSIZ */ struct ifnet *ifp; sbuf_printf(sb, "%6s|%58s|%s\n" "%6s|%58s|%58s\n", "Inter-", " Receive", " Transmit", " face", "bytes packets errs drop fifo frame compressed multicast", "bytes packets errs drop fifo colls carrier compressed"); CURVNET_SET(TD_TO_VNET(curthread)); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { linux_ifname(ifp, ifname, sizeof ifname); sbuf_printf(sb, "%6.6s: ", ifname); sbuf_printf(sb, "%7ju %7ju %4ju %4ju %4lu %5lu %10lu %9ju ", (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IBYTES), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IERRORS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS), /* rx_missed_errors */ 0UL, /* rx_fifo_errors */ 0UL, /* rx_length_errors + * rx_over_errors + * rx_crc_errors + * rx_frame_errors */ 0UL, /* rx_compressed */ (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS)); /* XXX-BZ rx only? */ sbuf_printf(sb, "%8ju %7ju %4ju %4ju %4lu %5ju %7lu %10lu\n", (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OBYTES), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OERRORS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS), 0UL, /* tx_fifo_errors */ (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS), 0UL, /* tx_carrier_errors + * tx_aborted_errors + * tx_window_errors + * tx_heartbeat_errors*/ 0UL); /* tx_compressed */ } IFNET_RUNLOCK(); CURVNET_RESTORE(); return (0); } /* * Filler function for proc/sys/kernel/osrelease */ static int linprocfs_doosrelease(PFS_FILL_ARGS) { char osrelease[LINUX_MAX_UTSNAME]; linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s\n", osrelease); return (0); } /* * Filler function for proc/sys/kernel/ostype */ static int linprocfs_doostype(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); sbuf_printf(sb, "%s\n", osname); return (0); } /* * Filler function for proc/sys/kernel/version */ static int linprocfs_doosbuild(PFS_FILL_ARGS) { linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/sys/kernel/msgmni */ static int linprocfs_domsgmni(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", msginfo.msgmni); return (0); } /* * Filler function for proc/sys/kernel/pid_max */ static int linprocfs_dopid_max(PFS_FILL_ARGS) { sbuf_printf(sb, "%i\n", PID_MAX); return (0); } /* * Filler function for proc/sys/kernel/sem */ static int linprocfs_dosem(PFS_FILL_ARGS) { sbuf_printf(sb, "%d %d %d %d\n", seminfo.semmsl, seminfo.semmns, seminfo.semopm, seminfo.semmni); return (0); } /* * Filler function for proc/scsi/device_info */ static int linprocfs_doscsidevinfo(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/scsi/scsi */ static int linprocfs_doscsiscsi(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/devices */ static int linprocfs_dodevices(PFS_FILL_ARGS) { char *char_devices; sbuf_printf(sb, "Character devices:\n"); char_devices = linux_get_char_devices(); sbuf_printf(sb, "%s", char_devices); linux_free_get_char_devices(char_devices); sbuf_printf(sb, "\nBlock devices:\n"); return (0); } /* * Filler function for proc/cmdline */ static int linprocfs_docmdline(PFS_FILL_ARGS) { sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname); sbuf_printf(sb, " ro root=302\n"); return (0); } /* * Filler function for proc/filesystems */ static int linprocfs_dofilesystems(PFS_FILL_ARGS) { struct vfsconf *vfsp; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_flags & VFCF_SYNTHETIC) sbuf_printf(sb, "nodev"); sbuf_printf(sb, "\t%s\n", vfsp->vfc_name); } vfsconf_sunlock(); return(0); } #if 0 /* * Filler function for proc/modules */ static int linprocfs_domodules(PFS_FILL_ARGS) { struct linker_file *lf; TAILQ_FOREACH(lf, &linker_files, link) { sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename, (unsigned long)lf->size, lf->refs); } return (0); } #endif /* * Filler function for proc/pid/fd */ static int linprocfs_dofdescfs(PFS_FILL_ARGS) { if (p == curproc) sbuf_printf(sb, "/dev/fd"); else sbuf_printf(sb, "unknown"); return (0); } /* * Filler function for proc/pid/limits */ static const struct linux_rlimit_ident { const char *desc; const char *unit; unsigned int rlim_id; } linux_rlimits_ident[] = { { "Max cpu time", "seconds", RLIMIT_CPU }, { "Max file size", "bytes", RLIMIT_FSIZE }, { "Max data size", "bytes", RLIMIT_DATA }, { "Max stack size", "bytes", RLIMIT_STACK }, { "Max core file size", "bytes", RLIMIT_CORE }, { "Max resident set", "bytes", RLIMIT_RSS }, { "Max processes", "processes", RLIMIT_NPROC }, { "Max open files", "files", RLIMIT_NOFILE }, { "Max locked memory", "bytes", RLIMIT_MEMLOCK }, { "Max address space", "bytes", RLIMIT_AS }, { "Max file locks", "locks", LINUX_RLIMIT_LOCKS }, { "Max pending signals", "signals", LINUX_RLIMIT_SIGPENDING }, { "Max msgqueue size", "bytes", LINUX_RLIMIT_MSGQUEUE }, { "Max nice priority", "", LINUX_RLIMIT_NICE }, { "Max realtime priority", "", LINUX_RLIMIT_RTPRIO }, { "Max realtime timeout", "us", LINUX_RLIMIT_RTTIME }, { 0, 0, 0 } }; static int linprocfs_doproclimits(PFS_FILL_ARGS) { const struct linux_rlimit_ident *li; struct plimit *limp; struct rlimit rl; ssize_t size; int res, error; error = 0; PROC_LOCK(p); limp = lim_hold(p->p_limit); PROC_UNLOCK(p); size = sizeof(res); sbuf_printf(sb, "%-26s%-21s%-21s%-21s\n", "Limit", "Soft Limit", "Hard Limit", "Units"); for (li = linux_rlimits_ident; li->desc != NULL; ++li) { switch (li->rlim_id) { case LINUX_RLIMIT_LOCKS: /* FALLTHROUGH */ case LINUX_RLIMIT_RTTIME: rl.rlim_cur = RLIM_INFINITY; break; case LINUX_RLIMIT_SIGPENDING: error = kernel_sysctlbyname(td, "kern.sigqueue.max_pending_per_proc", &res, &size, 0, 0, 0, 0); if (error != 0) goto out; rl.rlim_cur = res; rl.rlim_max = res; break; case LINUX_RLIMIT_MSGQUEUE: error = kernel_sysctlbyname(td, "kern.ipc.msgmnb", &res, &size, 0, 0, 0, 0); if (error != 0) goto out; rl.rlim_cur = res; rl.rlim_max = res; break; case LINUX_RLIMIT_NICE: /* FALLTHROUGH */ case LINUX_RLIMIT_RTPRIO: rl.rlim_cur = 0; rl.rlim_max = 0; break; default: rl = limp->pl_rlimit[li->rlim_id]; break; } if (rl.rlim_cur == RLIM_INFINITY) sbuf_printf(sb, "%-26s%-21s%-21s%-10s\n", li->desc, "unlimited", "unlimited", li->unit); else sbuf_printf(sb, "%-26s%-21llu%-21llu%-10s\n", li->desc, (unsigned long long)rl.rlim_cur, (unsigned long long)rl.rlim_max, li->unit); } out: lim_free(limp); return (error); } /* * Filler function for proc/sys/kernel/random/uuid */ static int linprocfs_douuid(PFS_FILL_ARGS) { struct uuid uuid; kern_uuidgen(&uuid, 1); sbuf_printf_uuid(sb, &uuid); sbuf_printf(sb, "\n"); return(0); } /* * Filler function for proc/pid/auxv */ static int linprocfs_doauxv(PFS_FILL_ARGS) { struct sbuf *asb; off_t buflen, resid; int error; /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwise. */ if (p->p_vmspace == &vmspace0) return (0); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0 || uio->uio_resid < 0) return (EINVAL); asb = sbuf_new_auto(); if (asb == NULL) return (ENOMEM); error = proc_getauxv(td, p, asb); if (error == 0) error = sbuf_finish(asb); resid = sbuf_len(asb) - uio->uio_offset; if (resid > uio->uio_resid) buflen = uio->uio_resid; else buflen = resid; if (buflen > IOSIZE_MAX) return (EINVAL); if (buflen > MAXPHYS) buflen = MAXPHYS; if (resid <= 0) return (0); if (error == 0) error = uiomove(sbuf_data(asb) + uio->uio_offset, buflen, uio); sbuf_delete(asb); return (error); } /* * Constructor */ static int linprocfs_init(PFS_INIT_ARGS) { struct pfs_node *root; struct pfs_node *dir; root = pi->pi_root; /* /proc/... */ pfs_create_file(root, "cmdline", &linprocfs_docmdline, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "devices", &linprocfs_dodevices, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "filesystems", &linprocfs_dofilesystems, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "loadavg", &linprocfs_doloadavg, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "meminfo", &linprocfs_domeminfo, NULL, NULL, NULL, PFS_RD); #if 0 pfs_create_file(root, "modules", &linprocfs_domodules, NULL, NULL, NULL, PFS_RD); #endif pfs_create_file(root, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "mtab", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "partitions", &linprocfs_dopartitions, NULL, NULL, NULL, PFS_RD); pfs_create_link(root, "self", &procfs_docurproc, NULL, NULL, NULL, 0); pfs_create_file(root, "stat", &linprocfs_dostat, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "swaps", &linprocfs_doswaps, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "uptime", &linprocfs_douptime, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "version", &linprocfs_doversion, NULL, NULL, NULL, PFS_RD); /* /proc/net/... */ dir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0); pfs_create_file(dir, "dev", &linprocfs_donetdev, NULL, NULL, NULL, PFS_RD); /* /proc//... */ dir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP); pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "cwd", &linprocfs_doproccwd, NULL, NULL, NULL, 0); pfs_create_file(dir, "environ", &linprocfs_doprocenviron, NULL, &procfs_candebug, NULL, PFS_RD); pfs_create_link(dir, "exe", &procfs_doprocfile, NULL, &procfs_notsystem, NULL, 0); pfs_create_file(dir, "maps", &linprocfs_doprocmaps, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "mem", &procfs_doprocmem, procfs_attr_rw, &procfs_candebug, NULL, PFS_RDWR | PFS_RAW); pfs_create_file(dir, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "root", &linprocfs_doprocroot, NULL, NULL, NULL, 0); pfs_create_file(dir, "stat", &linprocfs_doprocstat, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "statm", &linprocfs_doprocstatm, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "status", &linprocfs_doprocstatus, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "fd", &linprocfs_dofdescfs, NULL, NULL, NULL, 0); pfs_create_file(dir, "auxv", &linprocfs_doauxv, NULL, &procfs_candebug, NULL, PFS_RD|PFS_RAWRD); pfs_create_file(dir, "limits", &linprocfs_doproclimits, NULL, NULL, NULL, PFS_RD); /* /proc/scsi/... */ dir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0); pfs_create_file(dir, "device_info", &linprocfs_doscsidevinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "scsi", &linprocfs_doscsiscsi, NULL, NULL, NULL, PFS_RD); /* /proc/sys/... */ dir = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0); /* /proc/sys/kernel/... */ dir = pfs_create_dir(dir, "kernel", NULL, NULL, NULL, 0); pfs_create_file(dir, "osrelease", &linprocfs_doosrelease, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "ostype", &linprocfs_doostype, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "version", &linprocfs_doosbuild, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "msgmni", &linprocfs_domsgmni, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "pid_max", &linprocfs_dopid_max, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "sem", &linprocfs_dosem, NULL, NULL, NULL, PFS_RD); /* /proc/sys/kernel/random/... */ dir = pfs_create_dir(dir, "random", NULL, NULL, NULL, 0); pfs_create_file(dir, "uuid", &linprocfs_douuid, NULL, NULL, NULL, PFS_RD); return (0); } /* * Destructor */ static int linprocfs_uninit(PFS_INIT_ARGS) { /* nothing to do, pseudofs will GC */ return (0); } PSEUDOFS(linprocfs, 1, VFCF_JAIL); #if defined(__amd64__) MODULE_DEPEND(linprocfs, linux_common, 1, 1, 1); #else MODULE_DEPEND(linprocfs, linux, 1, 1, 1); #endif MODULE_DEPEND(linprocfs, procfs, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1); Index: head/sys/compat/linux/linux_ioctl.c =================================================================== --- head/sys/compat/linux/linux_ioctl.c (revision 334117) +++ head/sys/compat/linux/linux_ioctl.c (revision 334118) @@ -1,3800 +1,3800 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include #include #include #include #include #include CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ); static linux_ioctl_function_t linux_ioctl_cdrom; static linux_ioctl_function_t linux_ioctl_vfat; static linux_ioctl_function_t linux_ioctl_console; static linux_ioctl_function_t linux_ioctl_hdio; static linux_ioctl_function_t linux_ioctl_disk; static linux_ioctl_function_t linux_ioctl_socket; static linux_ioctl_function_t linux_ioctl_sound; static linux_ioctl_function_t linux_ioctl_termio; static linux_ioctl_function_t linux_ioctl_private; static linux_ioctl_function_t linux_ioctl_drm; static linux_ioctl_function_t linux_ioctl_sg; static linux_ioctl_function_t linux_ioctl_v4l; static linux_ioctl_function_t linux_ioctl_v4l2; static linux_ioctl_function_t linux_ioctl_special; static linux_ioctl_function_t linux_ioctl_fbsd_usb; static linux_ioctl_function_t linux_ioctl_evdev; static struct linux_ioctl_handler cdrom_handler = { linux_ioctl_cdrom, LINUX_IOCTL_CDROM_MIN, LINUX_IOCTL_CDROM_MAX }; static struct linux_ioctl_handler vfat_handler = { linux_ioctl_vfat, LINUX_IOCTL_VFAT_MIN, LINUX_IOCTL_VFAT_MAX }; static struct linux_ioctl_handler console_handler = { linux_ioctl_console, LINUX_IOCTL_CONSOLE_MIN, LINUX_IOCTL_CONSOLE_MAX }; static struct linux_ioctl_handler hdio_handler = { linux_ioctl_hdio, LINUX_IOCTL_HDIO_MIN, LINUX_IOCTL_HDIO_MAX }; static struct linux_ioctl_handler disk_handler = { linux_ioctl_disk, LINUX_IOCTL_DISK_MIN, LINUX_IOCTL_DISK_MAX }; static struct linux_ioctl_handler socket_handler = { linux_ioctl_socket, LINUX_IOCTL_SOCKET_MIN, LINUX_IOCTL_SOCKET_MAX }; static struct linux_ioctl_handler sound_handler = { linux_ioctl_sound, LINUX_IOCTL_SOUND_MIN, LINUX_IOCTL_SOUND_MAX }; static struct linux_ioctl_handler termio_handler = { linux_ioctl_termio, LINUX_IOCTL_TERMIO_MIN, LINUX_IOCTL_TERMIO_MAX }; static struct linux_ioctl_handler private_handler = { linux_ioctl_private, LINUX_IOCTL_PRIVATE_MIN, LINUX_IOCTL_PRIVATE_MAX }; static struct linux_ioctl_handler drm_handler = { linux_ioctl_drm, LINUX_IOCTL_DRM_MIN, LINUX_IOCTL_DRM_MAX }; static struct linux_ioctl_handler sg_handler = { linux_ioctl_sg, LINUX_IOCTL_SG_MIN, LINUX_IOCTL_SG_MAX }; static struct linux_ioctl_handler video_handler = { linux_ioctl_v4l, LINUX_IOCTL_VIDEO_MIN, LINUX_IOCTL_VIDEO_MAX }; static struct linux_ioctl_handler video2_handler = { linux_ioctl_v4l2, LINUX_IOCTL_VIDEO2_MIN, LINUX_IOCTL_VIDEO2_MAX }; static struct linux_ioctl_handler fbsd_usb = { linux_ioctl_fbsd_usb, FBSD_LUSB_MIN, FBSD_LUSB_MAX }; static struct linux_ioctl_handler evdev_handler = { linux_ioctl_evdev, LINUX_IOCTL_EVDEV_MIN, LINUX_IOCTL_EVDEV_MAX }; DATA_SET(linux_ioctl_handler_set, cdrom_handler); DATA_SET(linux_ioctl_handler_set, vfat_handler); DATA_SET(linux_ioctl_handler_set, console_handler); DATA_SET(linux_ioctl_handler_set, hdio_handler); DATA_SET(linux_ioctl_handler_set, disk_handler); DATA_SET(linux_ioctl_handler_set, socket_handler); DATA_SET(linux_ioctl_handler_set, sound_handler); DATA_SET(linux_ioctl_handler_set, termio_handler); DATA_SET(linux_ioctl_handler_set, private_handler); DATA_SET(linux_ioctl_handler_set, drm_handler); DATA_SET(linux_ioctl_handler_set, sg_handler); DATA_SET(linux_ioctl_handler_set, video_handler); DATA_SET(linux_ioctl_handler_set, video2_handler); DATA_SET(linux_ioctl_handler_set, fbsd_usb); DATA_SET(linux_ioctl_handler_set, evdev_handler); struct handler_element { TAILQ_ENTRY(handler_element) list; int (*func)(struct thread *, struct linux_ioctl_args *); int low, high, span; }; static TAILQ_HEAD(, handler_element) handlers = TAILQ_HEAD_INITIALIZER(handlers); static struct sx linux_ioctl_sx; SX_SYSINIT(linux_ioctl, &linux_ioctl_sx, "Linux ioctl handlers"); /* * hdio related ioctls for VMWare support */ struct linux_hd_geometry { u_int8_t heads; u_int8_t sectors; u_int16_t cylinders; u_int32_t start; }; struct linux_hd_big_geometry { u_int8_t heads; u_int8_t sectors; u_int32_t cylinders; u_int32_t start; }; static int linux_ioctl_hdio(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error; u_int sectorsize, fwcylinders, fwheads, fwsectors; off_t mediasize, bytespercyl; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_HDIO_GET_GEO: case LINUX_HDIO_GET_GEO_BIG: error = fo_ioctl(fp, DIOCGMEDIASIZE, (caddr_t)&mediasize, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGSECTORSIZE, (caddr_t)§orsize, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGFWHEADS, (caddr_t)&fwheads, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGFWSECTORS, (caddr_t)&fwsectors, td->td_ucred, td); /* * XXX: DIOCGFIRSTOFFSET is not yet implemented, so * so pretend that GEOM always says 0. This is NOT VALID * for slices or partitions, only the per-disk raw devices. */ fdrop(fp, td); if (error) return (error); /* * 1. Calculate the number of bytes in a cylinder, * given the firmware's notion of heads and sectors * per cylinder. * 2. Calculate the number of cylinders, given the total * size of the media. * All internal calculations should have 64-bit precision. */ bytespercyl = (off_t) sectorsize * fwheads * fwsectors; fwcylinders = mediasize / bytespercyl; #if defined(DEBUG) linux_msg(td, "HDIO_GET_GEO: mediasize %jd, c/h/s %d/%d/%d, " "bpc %jd", (intmax_t)mediasize, fwcylinders, fwheads, fwsectors, (intmax_t)bytespercyl); #endif if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO) { struct linux_hd_geometry hdg; hdg.cylinders = fwcylinders; hdg.heads = fwheads; hdg.sectors = fwsectors; hdg.start = 0; error = copyout(&hdg, (void *)args->arg, sizeof(hdg)); } else if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO_BIG) { struct linux_hd_big_geometry hdbg; memset(&hdbg, 0, sizeof(hdbg)); hdbg.cylinders = fwcylinders; hdbg.heads = fwheads; hdbg.sectors = fwsectors; hdbg.start = 0; error = copyout(&hdbg, (void *)args->arg, sizeof(hdbg)); } return (error); break; default: /* XXX */ linux_msg(td, "ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented", args->fd, (int)(args->cmd & 0xffff), (int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff)); break; } fdrop(fp, td); return (ENOIOCTL); } static int linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error; u_int sectorsize; off_t mediasize; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_BLKGETSIZE: error = fo_ioctl(fp, DIOCGSECTORSIZE, (caddr_t)§orsize, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGMEDIASIZE, (caddr_t)&mediasize, td->td_ucred, td); fdrop(fp, td); if (error) return (error); sectorsize = mediasize / sectorsize; /* * XXX: How do we know we return the right size of integer ? */ return (copyout(§orsize, (void *)args->arg, sizeof(sectorsize))); break; case LINUX_BLKSSZGET: error = fo_ioctl(fp, DIOCGSECTORSIZE, (caddr_t)§orsize, td->td_ucred, td); fdrop(fp, td); if (error) return (error); return (copyout(§orsize, (void *)args->arg, sizeof(sectorsize))); break; } fdrop(fp, td); return (ENOIOCTL); } /* * termio related ioctls */ struct linux_termio { unsigned short c_iflag; unsigned short c_oflag; unsigned short c_cflag; unsigned short c_lflag; unsigned char c_line; unsigned char c_cc[LINUX_NCC]; }; struct linux_termios { unsigned int c_iflag; unsigned int c_oflag; unsigned int c_cflag; unsigned int c_lflag; unsigned char c_line; unsigned char c_cc[LINUX_NCCS]; }; struct linux_winsize { unsigned short ws_row, ws_col; unsigned short ws_xpixel, ws_ypixel; }; struct speedtab { int sp_speed; /* Speed. */ int sp_code; /* Code. */ }; static struct speedtab sptab[] = { { B0, LINUX_B0 }, { B50, LINUX_B50 }, { B75, LINUX_B75 }, { B110, LINUX_B110 }, { B134, LINUX_B134 }, { B150, LINUX_B150 }, { B200, LINUX_B200 }, { B300, LINUX_B300 }, { B600, LINUX_B600 }, { B1200, LINUX_B1200 }, { B1800, LINUX_B1800 }, { B2400, LINUX_B2400 }, { B4800, LINUX_B4800 }, { B9600, LINUX_B9600 }, { B19200, LINUX_B19200 }, { B38400, LINUX_B38400 }, { B57600, LINUX_B57600 }, { B115200, LINUX_B115200 }, {-1, -1 } }; struct linux_serial_struct { int type; int line; int port; int irq; int flags; int xmit_fifo_size; int custom_divisor; int baud_base; unsigned short close_delay; char reserved_char[2]; int hub6; unsigned short closing_wait; unsigned short closing_wait2; int reserved[4]; }; static int linux_to_bsd_speed(int code, struct speedtab *table) { for ( ; table->sp_code != -1; table++) if (table->sp_code == code) return (table->sp_speed); return (-1); } static int bsd_to_linux_speed(int speed, struct speedtab *table) { for ( ; table->sp_speed != -1; table++) if (table->sp_speed == speed) return (table->sp_code); return (-1); } static void bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif lios->c_iflag = 0; if (bios->c_iflag & IGNBRK) lios->c_iflag |= LINUX_IGNBRK; if (bios->c_iflag & BRKINT) lios->c_iflag |= LINUX_BRKINT; if (bios->c_iflag & IGNPAR) lios->c_iflag |= LINUX_IGNPAR; if (bios->c_iflag & PARMRK) lios->c_iflag |= LINUX_PARMRK; if (bios->c_iflag & INPCK) lios->c_iflag |= LINUX_INPCK; if (bios->c_iflag & ISTRIP) lios->c_iflag |= LINUX_ISTRIP; if (bios->c_iflag & INLCR) lios->c_iflag |= LINUX_INLCR; if (bios->c_iflag & IGNCR) lios->c_iflag |= LINUX_IGNCR; if (bios->c_iflag & ICRNL) lios->c_iflag |= LINUX_ICRNL; if (bios->c_iflag & IXON) lios->c_iflag |= LINUX_IXON; if (bios->c_iflag & IXANY) lios->c_iflag |= LINUX_IXANY; if (bios->c_iflag & IXOFF) lios->c_iflag |= LINUX_IXOFF; if (bios->c_iflag & IMAXBEL) lios->c_iflag |= LINUX_IMAXBEL; lios->c_oflag = 0; if (bios->c_oflag & OPOST) lios->c_oflag |= LINUX_OPOST; if (bios->c_oflag & ONLCR) lios->c_oflag |= LINUX_ONLCR; if (bios->c_oflag & TAB3) lios->c_oflag |= LINUX_XTABS; lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab); lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4; if (bios->c_cflag & CSTOPB) lios->c_cflag |= LINUX_CSTOPB; if (bios->c_cflag & CREAD) lios->c_cflag |= LINUX_CREAD; if (bios->c_cflag & PARENB) lios->c_cflag |= LINUX_PARENB; if (bios->c_cflag & PARODD) lios->c_cflag |= LINUX_PARODD; if (bios->c_cflag & HUPCL) lios->c_cflag |= LINUX_HUPCL; if (bios->c_cflag & CLOCAL) lios->c_cflag |= LINUX_CLOCAL; if (bios->c_cflag & CRTSCTS) lios->c_cflag |= LINUX_CRTSCTS; lios->c_lflag = 0; if (bios->c_lflag & ISIG) lios->c_lflag |= LINUX_ISIG; if (bios->c_lflag & ICANON) lios->c_lflag |= LINUX_ICANON; if (bios->c_lflag & ECHO) lios->c_lflag |= LINUX_ECHO; if (bios->c_lflag & ECHOE) lios->c_lflag |= LINUX_ECHOE; if (bios->c_lflag & ECHOK) lios->c_lflag |= LINUX_ECHOK; if (bios->c_lflag & ECHONL) lios->c_lflag |= LINUX_ECHONL; if (bios->c_lflag & NOFLSH) lios->c_lflag |= LINUX_NOFLSH; if (bios->c_lflag & TOSTOP) lios->c_lflag |= LINUX_TOSTOP; if (bios->c_lflag & ECHOCTL) lios->c_lflag |= LINUX_ECHOCTL; if (bios->c_lflag & ECHOPRT) lios->c_lflag |= LINUX_ECHOPRT; if (bios->c_lflag & ECHOKE) lios->c_lflag |= LINUX_ECHOKE; if (bios->c_lflag & FLUSHO) lios->c_lflag |= LINUX_FLUSHO; if (bios->c_lflag & PENDIN) lios->c_lflag |= LINUX_PENDIN; if (bios->c_lflag & IEXTEN) lios->c_lflag |= LINUX_IEXTEN; for (i=0; ic_cc[i] = LINUX_POSIX_VDISABLE; lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR]; lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT]; lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE]; lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL]; lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF]; lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL]; lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN]; lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME]; lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2]; lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP]; lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART]; lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP]; lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT]; lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD]; lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE]; lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT]; for (i=0; ic_cc[i] == _POSIX_VDISABLE) lios->c_cc[i] = LINUX_POSIX_VDISABLE; } lios->c_line = 0; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif bios->c_iflag = 0; if (lios->c_iflag & LINUX_IGNBRK) bios->c_iflag |= IGNBRK; if (lios->c_iflag & LINUX_BRKINT) bios->c_iflag |= BRKINT; if (lios->c_iflag & LINUX_IGNPAR) bios->c_iflag |= IGNPAR; if (lios->c_iflag & LINUX_PARMRK) bios->c_iflag |= PARMRK; if (lios->c_iflag & LINUX_INPCK) bios->c_iflag |= INPCK; if (lios->c_iflag & LINUX_ISTRIP) bios->c_iflag |= ISTRIP; if (lios->c_iflag & LINUX_INLCR) bios->c_iflag |= INLCR; if (lios->c_iflag & LINUX_IGNCR) bios->c_iflag |= IGNCR; if (lios->c_iflag & LINUX_ICRNL) bios->c_iflag |= ICRNL; if (lios->c_iflag & LINUX_IXON) bios->c_iflag |= IXON; if (lios->c_iflag & LINUX_IXANY) bios->c_iflag |= IXANY; if (lios->c_iflag & LINUX_IXOFF) bios->c_iflag |= IXOFF; if (lios->c_iflag & LINUX_IMAXBEL) bios->c_iflag |= IMAXBEL; bios->c_oflag = 0; if (lios->c_oflag & LINUX_OPOST) bios->c_oflag |= OPOST; if (lios->c_oflag & LINUX_ONLCR) bios->c_oflag |= ONLCR; if (lios->c_oflag & LINUX_XTABS) bios->c_oflag |= TAB3; bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4; if (lios->c_cflag & LINUX_CSTOPB) bios->c_cflag |= CSTOPB; if (lios->c_cflag & LINUX_CREAD) bios->c_cflag |= CREAD; if (lios->c_cflag & LINUX_PARENB) bios->c_cflag |= PARENB; if (lios->c_cflag & LINUX_PARODD) bios->c_cflag |= PARODD; if (lios->c_cflag & LINUX_HUPCL) bios->c_cflag |= HUPCL; if (lios->c_cflag & LINUX_CLOCAL) bios->c_cflag |= CLOCAL; if (lios->c_cflag & LINUX_CRTSCTS) bios->c_cflag |= CRTSCTS; bios->c_lflag = 0; if (lios->c_lflag & LINUX_ISIG) bios->c_lflag |= ISIG; if (lios->c_lflag & LINUX_ICANON) bios->c_lflag |= ICANON; if (lios->c_lflag & LINUX_ECHO) bios->c_lflag |= ECHO; if (lios->c_lflag & LINUX_ECHOE) bios->c_lflag |= ECHOE; if (lios->c_lflag & LINUX_ECHOK) bios->c_lflag |= ECHOK; if (lios->c_lflag & LINUX_ECHONL) bios->c_lflag |= ECHONL; if (lios->c_lflag & LINUX_NOFLSH) bios->c_lflag |= NOFLSH; if (lios->c_lflag & LINUX_TOSTOP) bios->c_lflag |= TOSTOP; if (lios->c_lflag & LINUX_ECHOCTL) bios->c_lflag |= ECHOCTL; if (lios->c_lflag & LINUX_ECHOPRT) bios->c_lflag |= ECHOPRT; if (lios->c_lflag & LINUX_ECHOKE) bios->c_lflag |= ECHOKE; if (lios->c_lflag & LINUX_FLUSHO) bios->c_lflag |= FLUSHO; if (lios->c_lflag & LINUX_PENDIN) bios->c_lflag |= PENDIN; if (lios->c_lflag & LINUX_IEXTEN) bios->c_lflag |= IEXTEN; for (i=0; ic_cc[i] = _POSIX_VDISABLE; bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR]; bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT]; bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE]; bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL]; bios->c_cc[VEOF] = lios->c_cc[LINUX_VEOF]; bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL]; bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN]; bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME]; bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2]; bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP]; bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART]; bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP]; bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT]; bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD]; bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE]; bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT]; for (i=0; ic_cc[i] == LINUX_POSIX_VDISABLE) bios->c_cc[i] = _POSIX_VDISABLE; } bios->c_ispeed = bios->c_ospeed = linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab); #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio) { struct linux_termios lios; bsd_to_linux_termios(bios, &lios); lio->c_iflag = lios.c_iflag; lio->c_oflag = lios.c_oflag; lio->c_cflag = lios.c_cflag; lio->c_lflag = lios.c_lflag; lio->c_line = lios.c_line; memcpy(lio->c_cc, lios.c_cc, LINUX_NCC); } static void linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios) { struct linux_termios lios; int i; lios.c_iflag = lio->c_iflag; lios.c_oflag = lio->c_oflag; lios.c_cflag = lio->c_cflag; lios.c_lflag = lio->c_lflag; for (i=LINUX_NCC; ic_cc, LINUX_NCC); linux_to_bsd_termios(&lios, bios); } static int linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args) { struct termios bios; struct linux_termios lios; struct linux_termio lio; struct file *fp; int error; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_TCGETS: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred, td); if (error) break; bsd_to_linux_termios(&bios, &lios); error = copyout(&lios, (void *)args->arg, sizeof(lios)); break; case LINUX_TCSETS: error = copyin((void *)args->arg, &lios, sizeof(lios)); if (error) break; linux_to_bsd_termios(&lios, &bios); error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETSW: error = copyin((void *)args->arg, &lios, sizeof(lios)); if (error) break; linux_to_bsd_termios(&lios, &bios); error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETSF: error = copyin((void *)args->arg, &lios, sizeof(lios)); if (error) break; linux_to_bsd_termios(&lios, &bios); error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCGETA: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred, td); if (error) break; bsd_to_linux_termio(&bios, &lio); error = (copyout(&lio, (void *)args->arg, sizeof(lio))); break; case LINUX_TCSETA: error = copyin((void *)args->arg, &lio, sizeof(lio)); if (error) break; linux_to_bsd_termio(&lio, &bios); error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETAW: error = copyin((void *)args->arg, &lio, sizeof(lio)); if (error) break; linux_to_bsd_termio(&lio, &bios); error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETAF: error = copyin((void *)args->arg, &lio, sizeof(lio)); if (error) break; linux_to_bsd_termio(&lio, &bios); error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred, td)); break; /* LINUX_TCSBRK */ case LINUX_TCXONC: { switch (args->arg) { case LINUX_TCOOFF: args->cmd = TIOCSTOP; break; case LINUX_TCOON: args->cmd = TIOCSTART; break; case LINUX_TCIOFF: case LINUX_TCION: { int c; struct write_args wr; error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred, td); if (error) break; fdrop(fp, td); c = (args->arg == LINUX_TCIOFF) ? VSTOP : VSTART; c = bios.c_cc[c]; if (c != _POSIX_VDISABLE) { wr.fd = args->fd; wr.buf = &c; wr.nbyte = sizeof(c); return (sys_write(td, &wr)); } else return (0); } default: fdrop(fp, td); return (EINVAL); } args->arg = 0; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; } case LINUX_TCFLSH: { int val; switch (args->arg) { case LINUX_TCIFLUSH: val = FREAD; break; case LINUX_TCOFLUSH: val = FWRITE; break; case LINUX_TCIOFLUSH: val = FREAD | FWRITE; break; default: fdrop(fp, td); return (EINVAL); } error = (fo_ioctl(fp,TIOCFLUSH,(caddr_t)&val,td->td_ucred,td)); break; } case LINUX_TIOCEXCL: args->cmd = TIOCEXCL; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCNXCL: args->cmd = TIOCNXCL; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSCTTY: args->cmd = TIOCSCTTY; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCGPGRP: args->cmd = TIOCGPGRP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSPGRP: args->cmd = TIOCSPGRP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_TIOCOUTQ */ /* LINUX_TIOCSTI */ case LINUX_TIOCGWINSZ: args->cmd = TIOCGWINSZ; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSWINSZ: args->cmd = TIOCSWINSZ; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMGET: args->cmd = TIOCMGET; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMBIS: args->cmd = TIOCMBIS; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMBIC: args->cmd = TIOCMBIC; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMSET: args->cmd = TIOCMSET; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* TIOCGSOFTCAR */ /* TIOCSSOFTCAR */ case LINUX_FIONREAD: /* LINUX_TIOCINQ */ args->cmd = FIONREAD; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_TIOCLINUX */ case LINUX_TIOCCONS: args->cmd = TIOCCONS; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCGSERIAL: { struct linux_serial_struct lss; bzero(&lss, sizeof(lss)); lss.type = LINUX_PORT_16550A; lss.flags = 0; lss.close_delay = 0; error = copyout(&lss, (void *)args->arg, sizeof(lss)); break; } case LINUX_TIOCSSERIAL: { struct linux_serial_struct lss; error = copyin((void *)args->arg, &lss, sizeof(lss)); if (error) break; /* XXX - It really helps to have an implementation that * does nothing. NOT! */ error = 0; break; } case LINUX_TIOCPKT: args->cmd = TIOCPKT; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_FIONBIO: args->cmd = FIONBIO; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCNOTTY: args->cmd = TIOCNOTTY; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSETD: { int line; switch (args->arg) { case LINUX_N_TTY: line = TTYDISC; break; case LINUX_N_SLIP: line = SLIPDISC; break; case LINUX_N_PPP: line = PPPDISC; break; default: fdrop(fp, td); return (EINVAL); } error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td->td_ucred, td)); break; } case LINUX_TIOCGETD: { int linux_line; int bsd_line = TTYDISC; error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line, td->td_ucred, td); if (error) break; switch (bsd_line) { case TTYDISC: linux_line = LINUX_N_TTY; break; case SLIPDISC: linux_line = LINUX_N_SLIP; break; case PPPDISC: linux_line = LINUX_N_PPP; break; default: fdrop(fp, td); return (EINVAL); } error = (copyout(&linux_line, (void *)args->arg, sizeof(int))); break; } /* LINUX_TCSBRKP */ /* LINUX_TIOCTTYGSTRUCT */ case LINUX_FIONCLEX: args->cmd = FIONCLEX; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_FIOCLEX: args->cmd = FIOCLEX; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_FIOASYNC: args->cmd = FIOASYNC; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_TIOCSERCONFIG */ /* LINUX_TIOCSERGWILD */ /* LINUX_TIOCSERSWILD */ /* LINUX_TIOCGLCKTRMIOS */ /* LINUX_TIOCSLCKTRMIOS */ case LINUX_TIOCSBRK: args->cmd = TIOCSBRK; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCCBRK: args->cmd = TIOCCBRK; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCGPTN: { int nb; error = fo_ioctl(fp, TIOCGPTN, (caddr_t)&nb, td->td_ucred, td); if (!error) error = copyout(&nb, (void *)args->arg, sizeof(int)); break; } case LINUX_TIOCSPTLCK: /* Our unlockpt() does nothing. */ error = 0; break; default: error = ENOIOCTL; break; } fdrop(fp, td); return (error); } /* * CDROM related ioctls */ struct linux_cdrom_msf { u_char cdmsf_min0; u_char cdmsf_sec0; u_char cdmsf_frame0; u_char cdmsf_min1; u_char cdmsf_sec1; u_char cdmsf_frame1; }; struct linux_cdrom_tochdr { u_char cdth_trk0; u_char cdth_trk1; }; union linux_cdrom_addr { struct { u_char minute; u_char second; u_char frame; } msf; int lba; }; struct linux_cdrom_tocentry { u_char cdte_track; u_char cdte_adr:4; u_char cdte_ctrl:4; u_char cdte_format; union linux_cdrom_addr cdte_addr; u_char cdte_datamode; }; struct linux_cdrom_subchnl { u_char cdsc_format; u_char cdsc_audiostatus; u_char cdsc_adr:4; u_char cdsc_ctrl:4; u_char cdsc_trk; u_char cdsc_ind; union linux_cdrom_addr cdsc_absaddr; union linux_cdrom_addr cdsc_reladdr; }; struct l_cdrom_read_audio { union linux_cdrom_addr addr; u_char addr_format; l_int nframes; u_char *buf; }; struct l_dvd_layer { u_char book_version:4; u_char book_type:4; u_char min_rate:4; u_char disc_size:4; u_char layer_type:4; u_char track_path:1; u_char nlayers:2; u_char track_density:4; u_char linear_density:4; u_char bca:1; u_int32_t start_sector; u_int32_t end_sector; u_int32_t end_sector_l0; }; struct l_dvd_physical { u_char type; u_char layer_num; struct l_dvd_layer layer[4]; }; struct l_dvd_copyright { u_char type; u_char layer_num; u_char cpst; u_char rmi; }; struct l_dvd_disckey { u_char type; l_uint agid:2; u_char value[2048]; }; struct l_dvd_bca { u_char type; l_int len; u_char value[188]; }; struct l_dvd_manufact { u_char type; u_char layer_num; l_int len; u_char value[2048]; }; typedef union { u_char type; struct l_dvd_physical physical; struct l_dvd_copyright copyright; struct l_dvd_disckey disckey; struct l_dvd_bca bca; struct l_dvd_manufact manufact; } l_dvd_struct; typedef u_char l_dvd_key[5]; typedef u_char l_dvd_challenge[10]; struct l_dvd_lu_send_agid { u_char type; l_uint agid:2; }; struct l_dvd_host_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_send_key { u_char type; l_uint agid:2; l_dvd_key key; }; struct l_dvd_lu_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_lu_send_title_key { u_char type; l_uint agid:2; l_dvd_key title_key; l_int lba; l_uint cpm:1; l_uint cp_sec:1; l_uint cgms:2; }; struct l_dvd_lu_send_asf { u_char type; l_uint agid:2; l_uint asf:1; }; struct l_dvd_host_send_rpcstate { u_char type; u_char pdrc; }; struct l_dvd_lu_send_rpcstate { u_char type:2; u_char vra:3; u_char ucca:3; u_char region_mask; u_char rpc_scheme; }; typedef union { u_char type; struct l_dvd_lu_send_agid lsa; struct l_dvd_host_send_challenge hsc; struct l_dvd_send_key lsk; struct l_dvd_lu_send_challenge lsc; struct l_dvd_send_key hsk; struct l_dvd_lu_send_title_key lstk; struct l_dvd_lu_send_asf lsasf; struct l_dvd_host_send_rpcstate hrpcs; struct l_dvd_lu_send_rpcstate lrpcs; } l_dvd_authinfo; static void bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp) { if (af == CD_LBA_FORMAT) lp->lba = bp->lba; else { lp->msf.minute = bp->msf.minute; lp->msf.second = bp->msf.second; lp->msf.frame = bp->msf.frame; } } static void set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba) { if (format == LINUX_CDROM_MSF) { addr->msf.frame = lba % 75; lba /= 75; lba += 2; addr->msf.second = lba % 60; addr->msf.minute = lba / 60; } else addr->lba = lba; } static int linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp) { bp->format = lp->type; switch (bp->format) { case DVD_STRUCT_PHYSICAL: if (bp->layer_num >= 4) return (EINVAL); bp->layer_num = lp->physical.layer_num; break; case DVD_STRUCT_COPYRIGHT: bp->layer_num = lp->copyright.layer_num; break; case DVD_STRUCT_DISCKEY: bp->agid = lp->disckey.agid; break; case DVD_STRUCT_BCA: case DVD_STRUCT_MANUFACT: break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp) { switch (bp->format) { case DVD_STRUCT_PHYSICAL: { struct dvd_layer *blp = (struct dvd_layer *)bp->data; struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num]; memset(llp, 0, sizeof(*llp)); llp->book_version = blp->book_version; llp->book_type = blp->book_type; llp->min_rate = blp->max_rate; llp->disc_size = blp->disc_size; llp->layer_type = blp->layer_type; llp->track_path = blp->track_path; llp->nlayers = blp->nlayers; llp->track_density = blp->track_density; llp->linear_density = blp->linear_density; llp->bca = blp->bca; llp->start_sector = blp->start_sector; llp->end_sector = blp->end_sector; llp->end_sector_l0 = blp->end_sector_l0; break; } case DVD_STRUCT_COPYRIGHT: lp->copyright.cpst = bp->cpst; lp->copyright.rmi = bp->rmi; break; case DVD_STRUCT_DISCKEY: memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value)); break; case DVD_STRUCT_BCA: lp->bca.len = bp->length; memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value)); break; case DVD_STRUCT_MANUFACT: lp->manufact.len = bp->length; memcpy(lp->manufact.value, bp->data, sizeof(lp->manufact.value)); /* lp->manufact.layer_num is unused in Linux (redhat 7.0). */ break; default: return (EINVAL); } return (0); } static int linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode, struct dvd_authinfo *bp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_CHALLENGE; bp->agid = lp->hsc.agid; memcpy(bp->keychal, lp->hsc.chal, 10); break; case LINUX_DVD_LU_SEND_KEY1: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_KEY1; bp->agid = lp->lsk.agid; break; case LINUX_DVD_LU_SEND_CHALLENGE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_CHALLENGE; bp->agid = lp->lsc.agid; break; case LINUX_DVD_HOST_SEND_KEY2: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_KEY2; bp->agid = lp->hsk.agid; memcpy(bp->keychal, lp->hsk.key, 5); break; case LINUX_DVD_LU_SEND_TITLE_KEY: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_TITLE_KEY; bp->agid = lp->lstk.agid; bp->lba = lp->lstk.lba; break; case LINUX_DVD_LU_SEND_ASF: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_ASF; bp->agid = lp->lsasf.agid; break; case LINUX_DVD_INVALIDATE_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_INVALIDATE_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_LU_SEND_RPC_STATE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_RPC; break; case LINUX_DVD_HOST_SEND_RPC_STATE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_RPC; bp->region = lp->hrpcs.pdrc; break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: lp->lsa.agid = bp->agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: lp->type = LINUX_DVD_LU_SEND_KEY1; break; case LINUX_DVD_LU_SEND_KEY1: memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key)); break; case LINUX_DVD_LU_SEND_CHALLENGE: memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal)); break; case LINUX_DVD_HOST_SEND_KEY2: lp->type = LINUX_DVD_AUTH_ESTABLISHED; break; case LINUX_DVD_LU_SEND_TITLE_KEY: memcpy(lp->lstk.title_key, bp->keychal, sizeof(lp->lstk.title_key)); lp->lstk.cpm = bp->cpm; lp->lstk.cp_sec = bp->cp_sec; lp->lstk.cgms = bp->cgms; break; case LINUX_DVD_LU_SEND_ASF: lp->lsasf.asf = bp->asf; break; case LINUX_DVD_INVALIDATE_AGID: break; case LINUX_DVD_LU_SEND_RPC_STATE: lp->lrpcs.type = bp->reg_type; lp->lrpcs.vra = bp->vend_rsts; lp->lrpcs.ucca = bp->user_rsts; lp->lrpcs.region_mask = bp->region; lp->lrpcs.rpc_scheme = bp->rpc_scheme; break; case LINUX_DVD_HOST_SEND_RPC_STATE: break; default: return (EINVAL); } return (0); } static int linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_CDROMPAUSE: args->cmd = CDIOCPAUSE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMRESUME: args->cmd = CDIOCRESUME; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMPLAYMSF: args->cmd = CDIOCPLAYMSF; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMPLAYTRKIND: args->cmd = CDIOCPLAYTRACKS; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMREADTOCHDR: { struct ioc_toc_header th; struct linux_cdrom_tochdr lth; error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th, td->td_ucred, td); if (!error) { lth.cdth_trk0 = th.starting_track; lth.cdth_trk1 = th.ending_track; copyout(<h, (void *)args->arg, sizeof(lth)); } break; } case LINUX_CDROMREADTOCENTRY: { struct linux_cdrom_tocentry lte; struct ioc_read_toc_single_entry irtse; error = copyin((void *)args->arg, <e, sizeof(lte)); if (error) break; irtse.address_format = lte.cdte_format; irtse.track = lte.cdte_track; error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse, td->td_ucred, td); if (!error) { lte.cdte_ctrl = irtse.entry.control; lte.cdte_adr = irtse.entry.addr_type; bsd_to_linux_msf_lba(irtse.address_format, &irtse.entry.addr, <e.cdte_addr); error = copyout(<e, (void *)args->arg, sizeof(lte)); } break; } case LINUX_CDROMSTOP: args->cmd = CDIOCSTOP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMSTART: args->cmd = CDIOCSTART; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMEJECT: args->cmd = CDIOCEJECT; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_CDROMVOLCTRL */ case LINUX_CDROMSUBCHNL: { struct linux_cdrom_subchnl sc; struct ioc_read_subchannel bsdsc; struct cd_sub_channel_info bsdinfo; bsdsc.address_format = CD_LBA_FORMAT; bsdsc.data_format = CD_CURRENT_POSITION; bsdsc.track = 0; bsdsc.data_len = sizeof(bsdinfo); bsdsc.data = &bsdinfo; error = fo_ioctl(fp, CDIOCREADSUBCHANNEL_SYSSPACE, (caddr_t)&bsdsc, td->td_ucred, td); if (error) break; error = copyin((void *)args->arg, &sc, sizeof(sc)); if (error) break; sc.cdsc_audiostatus = bsdinfo.header.audio_status; sc.cdsc_adr = bsdinfo.what.position.addr_type; sc.cdsc_ctrl = bsdinfo.what.position.control; sc.cdsc_trk = bsdinfo.what.position.track_number; sc.cdsc_ind = bsdinfo.what.position.index_number; set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format, bsdinfo.what.position.absaddr.lba); set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format, bsdinfo.what.position.reladdr.lba); error = copyout(&sc, (void *)args->arg, sizeof(sc)); break; } /* LINUX_CDROMREADMODE2 */ /* LINUX_CDROMREADMODE1 */ /* LINUX_CDROMREADAUDIO */ /* LINUX_CDROMEJECT_SW */ /* LINUX_CDROMMULTISESSION */ /* LINUX_CDROM_GET_UPC */ case LINUX_CDROMRESET: args->cmd = CDIOCRESET; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_CDROMVOLREAD */ /* LINUX_CDROMREADRAW */ /* LINUX_CDROMREADCOOKED */ /* LINUX_CDROMSEEK */ /* LINUX_CDROMPLAYBLK */ /* LINUX_CDROMREADALL */ /* LINUX_CDROMCLOSETRAY */ /* LINUX_CDROMLOADFROMSLOT */ /* LINUX_CDROMGETSPINDOWN */ /* LINUX_CDROMSETSPINDOWN */ /* LINUX_CDROM_SET_OPTIONS */ /* LINUX_CDROM_CLEAR_OPTIONS */ /* LINUX_CDROM_SELECT_SPEED */ /* LINUX_CDROM_SELECT_DISC */ /* LINUX_CDROM_MEDIA_CHANGED */ /* LINUX_CDROM_DRIVE_STATUS */ /* LINUX_CDROM_DISC_STATUS */ /* LINUX_CDROM_CHANGER_NSLOTS */ /* LINUX_CDROM_LOCKDOOR */ /* LINUX_CDROM_DEBUG */ /* LINUX_CDROM_GET_CAPABILITY */ /* LINUX_CDROMAUDIOBUFSIZ */ case LINUX_DVD_READ_STRUCT: { l_dvd_struct *lds; struct dvd_struct *bds; lds = malloc(sizeof(*lds), M_LINUX, M_WAITOK); bds = malloc(sizeof(*bds), M_LINUX, M_WAITOK); error = copyin((void *)args->arg, lds, sizeof(*lds)); if (error) goto out; error = linux_to_bsd_dvd_struct(lds, bds); if (error) goto out; error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)bds, td->td_ucred, td); if (error) goto out; error = bsd_to_linux_dvd_struct(bds, lds); if (error) goto out; error = copyout(lds, (void *)args->arg, sizeof(*lds)); out: free(bds, M_LINUX); free(lds, M_LINUX); break; } /* LINUX_DVD_WRITE_STRUCT */ case LINUX_DVD_AUTH: { l_dvd_authinfo lda; struct dvd_authinfo bda; int bcode; error = copyin((void *)args->arg, &lda, sizeof(lda)); if (error) break; error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda); if (error) break; error = fo_ioctl(fp, bcode, (caddr_t)&bda, td->td_ucred, td); if (error) { if (lda.type == LINUX_DVD_HOST_SEND_KEY2) { lda.type = LINUX_DVD_AUTH_FAILURE; copyout(&lda, (void *)args->arg, sizeof(lda)); } break; } error = bsd_to_linux_dvd_authinfo(&bda, &lda); if (error) break; error = copyout(&lda, (void *)args->arg, sizeof(lda)); break; } case LINUX_SCSI_GET_BUS_NUMBER: { struct sg_scsi_id id; error = fo_ioctl(fp, SG_GET_SCSI_ID, (caddr_t)&id, td->td_ucred, td); if (error) break; error = copyout(&id.channel, (void *)args->arg, sizeof(int)); break; } case LINUX_SCSI_GET_IDLUN: { struct sg_scsi_id id; struct scsi_idlun idl; error = fo_ioctl(fp, SG_GET_SCSI_ID, (caddr_t)&id, td->td_ucred, td); if (error) break; idl.dev_id = (id.scsi_id & 0xff) + ((id.lun & 0xff) << 8) + ((id.channel & 0xff) << 16) + ((id.host_no & 0xff) << 24); idl.host_unique_id = id.host_no; error = copyout(&idl, (void *)args->arg, sizeof(idl)); break; } /* LINUX_CDROM_SEND_PACKET */ /* LINUX_CDROM_NEXT_WRITABLE */ /* LINUX_CDROM_LAST_WRITTEN */ default: error = ENOIOCTL; break; } fdrop(fp, td); return (error); } static int linux_ioctl_vfat(struct thread *td, struct linux_ioctl_args *args) { return (ENOTTY); } /* * Sound related ioctls */ struct linux_old_mixer_info { char id[16]; char name[32]; }; static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT }; #define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30]) static int linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args) { switch (args->cmd & 0xffff) { case LINUX_SOUND_MIXER_WRITE_VOLUME: args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_BASS: args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_TREBLE: args->cmd = SETDIR(SOUND_MIXER_WRITE_TREBLE); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SYNTH: args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_PCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SPEAKER: args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_MIC: args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_CD: args->cmd = SETDIR(SOUND_MIXER_WRITE_CD); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IMIX: args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_ALTPCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECLEV: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_OGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE1: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE2: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE3: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_INFO: { /* Key on encoded length */ switch ((args->cmd >> 16) & 0x1fff) { case 0x005c: { /* SOUND_MIXER_INFO */ args->cmd = SOUND_MIXER_INFO; return (sys_ioctl(td, (struct ioctl_args *)args)); } case 0x0030: { /* SOUND_OLD_MIXER_INFO */ struct linux_old_mixer_info info; bzero(&info, sizeof(info)); strncpy(info.id, "OSS", sizeof(info.id) - 1); strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1); copyout(&info, (void *)args->arg, sizeof(info)); return (0); } default: return (ENOIOCTL); } break; } case LINUX_OSS_GETVERSION: { int version = linux_get_oss_version(td); return (copyout(&version, (void *)args->arg, sizeof(int))); } case LINUX_SOUND_MIXER_READ_STEREODEVS: args->cmd = SOUND_MIXER_READ_STEREODEVS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_READ_CAPS: args->cmd = SOUND_MIXER_READ_CAPS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_READ_RECMASK: args->cmd = SOUND_MIXER_READ_RECMASK; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_READ_DEVMASK: args->cmd = SOUND_MIXER_READ_DEVMASK; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECSRC: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_RESET: args->cmd = SNDCTL_DSP_RESET; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SYNC: args->cmd = SNDCTL_DSP_SYNC; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SPEED: args->cmd = SNDCTL_DSP_SPEED; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_STEREO: args->cmd = SNDCTL_DSP_STEREO; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */ args->cmd = SNDCTL_DSP_GETBLKSIZE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFMT: args->cmd = SNDCTL_DSP_SETFMT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_CHANNELS: args->cmd = SOUND_PCM_WRITE_CHANNELS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_FILTER: args->cmd = SOUND_PCM_WRITE_FILTER; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_POST: args->cmd = SNDCTL_DSP_POST; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SUBDIVIDE: args->cmd = SNDCTL_DSP_SUBDIVIDE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFRAGMENT: args->cmd = SNDCTL_DSP_SETFRAGMENT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETFMTS: args->cmd = SNDCTL_DSP_GETFMTS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOSPACE: args->cmd = SNDCTL_DSP_GETOSPACE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETISPACE: args->cmd = SNDCTL_DSP_GETISPACE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_NONBLOCK: args->cmd = SNDCTL_DSP_NONBLOCK; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETCAPS: args->cmd = SNDCTL_DSP_GETCAPS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */ args->cmd = SNDCTL_DSP_SETTRIGGER; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETIPTR: args->cmd = SNDCTL_DSP_GETIPTR; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOPTR: args->cmd = SNDCTL_DSP_GETOPTR; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETDUPLEX: args->cmd = SNDCTL_DSP_SETDUPLEX; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETODELAY: args->cmd = SNDCTL_DSP_GETODELAY; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESET: args->cmd = SNDCTL_SEQ_RESET; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_SYNC: args->cmd = SNDCTL_SEQ_SYNC; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_INFO: args->cmd = SNDCTL_SYNTH_INFO; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_CTRLRATE: args->cmd = SNDCTL_SEQ_CTRLRATE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETOUTCOUNT: args->cmd = SNDCTL_SEQ_GETOUTCOUNT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETINCOUNT: args->cmd = SNDCTL_SEQ_GETINCOUNT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_PERCMODE: args->cmd = SNDCTL_SEQ_PERCMODE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_FM_LOAD_INSTR: args->cmd = SNDCTL_FM_LOAD_INSTR; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TESTMIDI: args->cmd = SNDCTL_SEQ_TESTMIDI; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESETSAMPLES: args->cmd = SNDCTL_SEQ_RESETSAMPLES; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRSYNTHS: args->cmd = SNDCTL_SEQ_NRSYNTHS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRMIDIS: args->cmd = SNDCTL_SEQ_NRMIDIS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_MIDI_INFO: args->cmd = SNDCTL_MIDI_INFO; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TRESHOLD: args->cmd = SNDCTL_SEQ_TRESHOLD; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_MEMAVL: args->cmd = SNDCTL_SYNTH_MEMAVL; return (sys_ioctl(td, (struct ioctl_args *)args)); } return (ENOIOCTL); } /* * Console related ioctls */ static int linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_KIOCSOUND: args->cmd = KIOCSOUND; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDMKTONE: args->cmd = KDMKTONE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDGETLED: args->cmd = KDGETLED; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDSETLED: args->cmd = KDSETLED; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDSETMODE: args->cmd = KDSETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDGETMODE: args->cmd = KDGETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDGKBMODE: args->cmd = KDGKBMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDSKBMODE: { int kbdmode; switch (args->arg) { case LINUX_KBD_RAW: kbdmode = K_RAW; break; case LINUX_KBD_XLATE: kbdmode = K_XLATE; break; case LINUX_KBD_MEDIUMRAW: kbdmode = K_RAW; break; default: fdrop(fp, td); return (EINVAL); } error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode, td->td_ucred, td)); break; } case LINUX_VT_OPENQRY: args->cmd = VT_OPENQRY; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_GETMODE: args->cmd = VT_GETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_SETMODE: { struct vt_mode mode; if ((error = copyin((void *)args->arg, &mode, sizeof(mode)))) break; if (LINUX_SIG_VALID(mode.relsig)) mode.relsig = linux_to_bsd_signal(mode.relsig); else mode.relsig = 0; if (LINUX_SIG_VALID(mode.acqsig)) mode.acqsig = linux_to_bsd_signal(mode.acqsig); else mode.acqsig = 0; /* XXX. Linux ignores frsig and set it to 0. */ mode.frsig = 0; if ((error = copyout(&mode, (void *)args->arg, sizeof(mode)))) break; args->cmd = VT_SETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; } case LINUX_VT_GETSTATE: args->cmd = VT_GETACTIVE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_RELDISP: args->cmd = VT_RELDISP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_ACTIVATE: args->cmd = VT_ACTIVATE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_WAITACTIVE: args->cmd = VT_WAITACTIVE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; default: error = ENOIOCTL; break; } fdrop(fp, td); return (error); } /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) /* * Translate a Linux interface name to a FreeBSD interface name, * and return the associated ifnet structure * bsdname and lxname need to be least IFNAMSIZ bytes long, but * can point to the same buffer. */ static struct ifnet * ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname) { struct ifnet *ifp; int len, unit; char *ep; int is_eth, index; for (len = 0; len < LINUX_IFNAMSIZ; ++len) if (!isalpha(lxname[len])) break; if (len == 0 || len == LINUX_IFNAMSIZ) return (NULL); unit = (int)strtoul(lxname + len, &ep, 10); if (ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ) return (NULL); index = 0; is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0; CURVNET_SET(TD_TO_VNET(td)); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. Don't presume * we never have an interface named "eth", so don't make * the test optional based on is_eth. */ if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0) break; if (is_eth && IFP_IS_ETH(ifp) && unit == index++) break; } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (ifp != NULL) strlcpy(bsdname, ifp->if_xname, IFNAMSIZ); return (ifp); } /* * Implement the SIOCGIFNAME ioctl */ static int linux_ioctl_ifname(struct thread *td, struct l_ifreq *uifr) { struct l_ifreq ifr; struct ifnet *ifp; int error, ethno, index; error = copyin(uifr, &ifr, sizeof(ifr)); if (error != 0) return (error); CURVNET_SET(TD_TO_VNET(curthread)); IFNET_RLOCK(); index = 1; /* ifr.ifr_ifindex starts from 1 */ ethno = 0; error = ENODEV; - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifr.ifr_ifindex == index) { if (IFP_IS_ETH(ifp)) snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d", ethno); else strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ); error = 0; break; } if (IFP_IS_ETH(ifp)) ethno++; index++; } IFNET_RUNLOCK(); if (error == 0) error = copyout(&ifr, uifr, sizeof(ifr)); CURVNET_RESTORE(); return (error); } /* * Implement the SIOCGIFCONF ioctl */ static int linux_ifconf(struct thread *td, struct ifconf *uifc) { #ifdef COMPAT_LINUX32 struct l_ifconf ifc; #else struct ifconf ifc; #endif struct l_ifreq ifr; struct ifnet *ifp; struct ifaddr *ifa; struct sbuf *sb; int error, ethno, full = 0, valid_len, max_len; error = copyin(uifc, &ifc, sizeof(ifc)); if (error != 0) return (error); max_len = MAXPHYS - 1; CURVNET_SET(TD_TO_VNET(td)); /* handle the 'request buffer size' case */ if ((l_uintptr_t)ifc.ifc_buf == PTROUT(NULL)) { ifc.ifc_len = 0; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (sa->sa_family == AF_INET) ifc.ifc_len += sizeof(ifr); } } IFNET_RUNLOCK(); error = copyout(&ifc, uifc, sizeof(ifc)); CURVNET_RESTORE(); return (error); } if (ifc.ifc_len <= 0) { CURVNET_RESTORE(); return (EINVAL); } again: /* Keep track of eth interfaces */ ethno = 0; if (ifc.ifc_len <= max_len) { max_len = ifc.ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; /* Return all AF_INET addresses of all interfaces */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs = 0; bzero(&ifr, sizeof(ifr)); if (IFP_IS_ETH(ifp)) snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d", ethno++); else strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ); /* Walk the address list */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (sa->sa_family == AF_INET) { ifr.ifr_addr.sa_family = LINUX_AF_INET; memcpy(ifr.ifr_addr.sa_data, sa->sa_data, sizeof(ifr.ifr_addr.sa_data)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); addrs++; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc.ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), PTRIN(ifc.ifc_buf), ifc.ifc_len); if (error == 0) error = copyout(&ifc, uifc, sizeof(ifc)); sbuf_delete(sb); CURVNET_RESTORE(); return (error); } static int linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr) { l_short flags; flags = (ifp->if_flags | ifp->if_drv_flags) & 0xffff; /* these flags have no Linux equivalent */ flags &= ~(IFF_DRV_OACTIVE|IFF_SIMPLEX| IFF_LINK0|IFF_LINK1|IFF_LINK2); /* Linux' multicast flag is in a different bit */ if (flags & IFF_MULTICAST) { flags &= ~IFF_MULTICAST; flags |= 0x1000; } return (copyout(&flags, &ifr->ifr_flags, sizeof(flags))); } #define ARPHRD_ETHER 1 #define ARPHRD_LOOPBACK 772 static int linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr) { struct ifaddr *ifa; struct sockaddr_dl *sdl; struct l_sockaddr lsa; if (ifp->if_type == IFT_LOOP) { bzero(&lsa, sizeof(lsa)); lsa.sa_family = ARPHRD_LOOPBACK; return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa))); } if (ifp->if_type != IFT_ETHER) return (ENOENT); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && (sdl->sdl_family == AF_LINK) && (sdl->sdl_type == IFT_ETHER)) { bzero(&lsa, sizeof(lsa)); lsa.sa_family = ARPHRD_ETHER; bcopy(LLADDR(sdl), lsa.sa_data, LINUX_IFHWADDRLEN); return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa))); } } return (ENOENT); } /* * If we fault in bsd_to_linux_ifreq() then we will fault when we call * the native ioctl(). Thus, we don't really need to check the return * value of this function. */ static int bsd_to_linux_ifreq(struct ifreq *arg) { struct ifreq ifr; size_t ifr_len = sizeof(struct ifreq); int error; if ((error = copyin(arg, &ifr, ifr_len))) return (error); *(u_short *)&ifr.ifr_addr = ifr.ifr_addr.sa_family; error = copyout(&ifr, arg, ifr_len); return (error); } /* * Socket related ioctls */ static int linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args) { char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ]; struct ifnet *ifp; struct file *fp; int error, type; ifp = NULL; error = 0; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); type = fp->f_type; fdrop(fp, td); if (type != DTYPE_SOCKET) { /* not a socket - probably a tap / vmnet device */ switch (args->cmd) { case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFFLAGS: return (linux_ioctl_special(td, args)); default: return (ENOIOCTL); } } switch (args->cmd & 0xffff) { case LINUX_FIOGETOWN: case LINUX_FIOSETOWN: case LINUX_SIOCADDMULTI: case LINUX_SIOCATMARK: case LINUX_SIOCDELMULTI: case LINUX_SIOCGIFNAME: case LINUX_SIOCGIFCONF: case LINUX_SIOCGPGRP: case LINUX_SIOCSPGRP: case LINUX_SIOCGIFCOUNT: /* these ioctls don't take an interface name */ #ifdef DEBUG printf("%s(): ioctl %d\n", __func__, args->cmd & 0xffff); #endif break; case LINUX_SIOCGIFFLAGS: case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFDSTADDR: case LINUX_SIOCGIFBRDADDR: case LINUX_SIOCGIFNETMASK: case LINUX_SIOCSIFNETMASK: case LINUX_SIOCGIFMTU: case LINUX_SIOCSIFMTU: case LINUX_SIOCSIFNAME: case LINUX_SIOCGIFHWADDR: case LINUX_SIOCSIFHWADDR: case LINUX_SIOCDEVPRIVATE: case LINUX_SIOCDEVPRIVATE+1: case LINUX_SIOCGIFINDEX: /* copy in the interface name and translate it. */ error = copyin((void *)args->arg, lifname, LINUX_IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): ioctl %d on %.*s\n", __func__, args->cmd & 0xffff, LINUX_IFNAMSIZ, lifname); #endif memset(ifname, 0, sizeof(ifname)); ifp = ifname_linux_to_bsd(td, lifname, ifname); if (ifp == NULL) return (EINVAL); /* * We need to copy it back out in case we pass the * request on to our native ioctl(), which will expect * the ifreq to be in user space and have the correct * interface name. */ error = copyout(ifname, (void *)args->arg, IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): %s translated to %s\n", __func__, lifname, ifname); #endif break; default: return (ENOIOCTL); } switch (args->cmd & 0xffff) { case LINUX_FIOSETOWN: args->cmd = FIOSETOWN; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSPGRP: args->cmd = SIOCSPGRP; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_FIOGETOWN: args->cmd = FIOGETOWN; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGPGRP: args->cmd = SIOCGPGRP; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCATMARK: args->cmd = SIOCATMARK; error = sys_ioctl(td, (struct ioctl_args *)args); break; /* LINUX_SIOCGSTAMP */ case LINUX_SIOCGIFNAME: error = linux_ioctl_ifname(td, (struct l_ifreq *)args->arg); break; case LINUX_SIOCGIFCONF: error = linux_ifconf(td, (struct ifconf *)args->arg); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCGIFADDR: args->cmd = SIOCGIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCSIFADDR: /* XXX probably doesn't work, included for completeness */ args->cmd = SIOCSIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFDSTADDR: args->cmd = SIOCGIFDSTADDR; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCGIFBRDADDR: args->cmd = SIOCGIFBRDADDR; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCGIFNETMASK: args->cmd = SIOCGIFNETMASK; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCSIFNETMASK: error = ENOIOCTL; break; case LINUX_SIOCGIFMTU: args->cmd = SIOCGIFMTU; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFMTU: args->cmd = SIOCSIFMTU; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFNAME: error = ENOIOCTL; break; case LINUX_SIOCGIFHWADDR: error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCSIFHWADDR: error = ENOIOCTL; break; case LINUX_SIOCADDMULTI: args->cmd = SIOCADDMULTI; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDELMULTI: args->cmd = SIOCDELMULTI; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFINDEX: args->cmd = SIOCGIFINDEX; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFCOUNT: error = 0; break; /* * XXX This is slightly bogus, but these ioctls are currently * XXX only used by the aironet (if_an) network driver. */ case LINUX_SIOCDEVPRIVATE: args->cmd = SIOCGPRIVATE_0; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDEVPRIVATE+1: args->cmd = SIOCGPRIVATE_1; error = sys_ioctl(td, (struct ioctl_args *)args); break; } if (ifp != NULL) /* restore the original interface name */ copyout(lifname, (void *)args->arg, LINUX_IFNAMSIZ); #ifdef DEBUG printf("%s(): returning %d\n", __func__, error); #endif return (error); } /* * Device private ioctl handler */ static int linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error, type; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); type = fp->f_type; fdrop(fp, td); if (type == DTYPE_SOCKET) return (linux_ioctl_socket(td, args)); return (ENOIOCTL); } /* * DRM ioctl handler (sys/dev/drm) */ static int linux_ioctl_drm(struct thread *td, struct linux_ioctl_args *args) { args->cmd = SETDIR(args->cmd); return (sys_ioctl(td, (struct ioctl_args *)args)); } #ifdef COMPAT_LINUX32 #define CP(src,dst,fld) do { (dst).fld = (src).fld; } while (0) #define PTRIN_CP(src,dst,fld) \ do { (dst).fld = PTRIN((src).fld); } while (0) #define PTROUT_CP(src,dst,fld) \ do { (dst).fld = PTROUT((src).fld); } while (0) static int linux_ioctl_sg_io(struct thread *td, struct linux_ioctl_args *args) { struct sg_io_hdr io; struct sg_io_hdr32 io32; struct file *fp; int error; error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) { printf("sg_linux_ioctl: fget returned %d\n", error); return (error); } if ((error = copyin((void *)args->arg, &io32, sizeof(io32))) != 0) goto out; CP(io32, io, interface_id); CP(io32, io, dxfer_direction); CP(io32, io, cmd_len); CP(io32, io, mx_sb_len); CP(io32, io, iovec_count); CP(io32, io, dxfer_len); PTRIN_CP(io32, io, dxferp); PTRIN_CP(io32, io, cmdp); PTRIN_CP(io32, io, sbp); CP(io32, io, timeout); CP(io32, io, flags); CP(io32, io, pack_id); PTRIN_CP(io32, io, usr_ptr); CP(io32, io, status); CP(io32, io, masked_status); CP(io32, io, msg_status); CP(io32, io, sb_len_wr); CP(io32, io, host_status); CP(io32, io, driver_status); CP(io32, io, resid); CP(io32, io, duration); CP(io32, io, info); if ((error = fo_ioctl(fp, SG_IO, (caddr_t)&io, td->td_ucred, td)) != 0) goto out; CP(io, io32, interface_id); CP(io, io32, dxfer_direction); CP(io, io32, cmd_len); CP(io, io32, mx_sb_len); CP(io, io32, iovec_count); CP(io, io32, dxfer_len); PTROUT_CP(io, io32, dxferp); PTROUT_CP(io, io32, cmdp); PTROUT_CP(io, io32, sbp); CP(io, io32, timeout); CP(io, io32, flags); CP(io, io32, pack_id); PTROUT_CP(io, io32, usr_ptr); CP(io, io32, status); CP(io, io32, masked_status); CP(io, io32, msg_status); CP(io, io32, sb_len_wr); CP(io, io32, host_status); CP(io, io32, driver_status); CP(io, io32, resid); CP(io, io32, duration); CP(io, io32, info); error = copyout(&io32, (void *)args->arg, sizeof(io32)); out: fdrop(fp, td); return (error); } #endif static int linux_ioctl_sg(struct thread *td, struct linux_ioctl_args *args) { switch (args->cmd) { case LINUX_SG_GET_VERSION_NUM: args->cmd = SG_GET_VERSION_NUM; break; case LINUX_SG_SET_TIMEOUT: args->cmd = SG_SET_TIMEOUT; break; case LINUX_SG_GET_TIMEOUT: args->cmd = SG_GET_TIMEOUT; break; case LINUX_SG_IO: args->cmd = SG_IO; #ifdef COMPAT_LINUX32 return (linux_ioctl_sg_io(td, args)); #endif break; case LINUX_SG_GET_RESERVED_SIZE: args->cmd = SG_GET_RESERVED_SIZE; break; case LINUX_SG_GET_SCSI_ID: args->cmd = SG_GET_SCSI_ID; break; case LINUX_SG_GET_SG_TABLESIZE: args->cmd = SG_GET_SG_TABLESIZE; break; default: return (ENODEV); } return (sys_ioctl(td, (struct ioctl_args *)args)); } /* * Video4Linux (V4L) ioctl handler */ static int linux_to_bsd_v4l_tuner(struct l_video_tuner *lvt, struct video_tuner *vt) { vt->tuner = lvt->tuner; strlcpy(vt->name, lvt->name, LINUX_VIDEO_TUNER_NAME_SIZE); vt->rangelow = lvt->rangelow; /* possible long size conversion */ vt->rangehigh = lvt->rangehigh; /* possible long size conversion */ vt->flags = lvt->flags; vt->mode = lvt->mode; vt->signal = lvt->signal; return (0); } static int bsd_to_linux_v4l_tuner(struct video_tuner *vt, struct l_video_tuner *lvt) { lvt->tuner = vt->tuner; strlcpy(lvt->name, vt->name, LINUX_VIDEO_TUNER_NAME_SIZE); lvt->rangelow = vt->rangelow; /* possible long size conversion */ lvt->rangehigh = vt->rangehigh; /* possible long size conversion */ lvt->flags = vt->flags; lvt->mode = vt->mode; lvt->signal = vt->signal; return (0); } #ifdef COMPAT_LINUX_V4L_CLIPLIST static int linux_to_bsd_v4l_clip(struct l_video_clip *lvc, struct video_clip *vc) { vc->x = lvc->x; vc->y = lvc->y; vc->width = lvc->width; vc->height = lvc->height; vc->next = PTRIN(lvc->next); /* possible pointer size conversion */ return (0); } #endif static int linux_to_bsd_v4l_window(struct l_video_window *lvw, struct video_window *vw) { vw->x = lvw->x; vw->y = lvw->y; vw->width = lvw->width; vw->height = lvw->height; vw->chromakey = lvw->chromakey; vw->flags = lvw->flags; vw->clips = PTRIN(lvw->clips); /* possible pointer size conversion */ vw->clipcount = lvw->clipcount; return (0); } static int bsd_to_linux_v4l_window(struct video_window *vw, struct l_video_window *lvw) { lvw->x = vw->x; lvw->y = vw->y; lvw->width = vw->width; lvw->height = vw->height; lvw->chromakey = vw->chromakey; lvw->flags = vw->flags; lvw->clips = PTROUT(vw->clips); /* possible pointer size conversion */ lvw->clipcount = vw->clipcount; return (0); } static int linux_to_bsd_v4l_buffer(struct l_video_buffer *lvb, struct video_buffer *vb) { vb->base = PTRIN(lvb->base); /* possible pointer size conversion */ vb->height = lvb->height; vb->width = lvb->width; vb->depth = lvb->depth; vb->bytesperline = lvb->bytesperline; return (0); } static int bsd_to_linux_v4l_buffer(struct video_buffer *vb, struct l_video_buffer *lvb) { lvb->base = PTROUT(vb->base); /* possible pointer size conversion */ lvb->height = vb->height; lvb->width = vb->width; lvb->depth = vb->depth; lvb->bytesperline = vb->bytesperline; return (0); } static int linux_to_bsd_v4l_code(struct l_video_code *lvc, struct video_code *vc) { strlcpy(vc->loadwhat, lvc->loadwhat, LINUX_VIDEO_CODE_LOADWHAT_SIZE); vc->datasize = lvc->datasize; vc->data = PTRIN(lvc->data); /* possible pointer size conversion */ return (0); } #ifdef COMPAT_LINUX_V4L_CLIPLIST static int linux_v4l_clip_copy(void *lvc, struct video_clip **ppvc) { int error; struct video_clip vclip; struct l_video_clip l_vclip; error = copyin(lvc, &l_vclip, sizeof(l_vclip)); if (error) return (error); linux_to_bsd_v4l_clip(&l_vclip, &vclip); /* XXX: If there can be no concurrency: s/M_NOWAIT/M_WAITOK/ */ if ((*ppvc = malloc(sizeof(**ppvc), M_LINUX, M_NOWAIT)) == NULL) return (ENOMEM); /* XXX: Linux has no ENOMEM here. */ memcpy(*ppvc, &vclip, sizeof(vclip)); (*ppvc)->next = NULL; return (0); } static int linux_v4l_cliplist_free(struct video_window *vw) { struct video_clip **ppvc; struct video_clip **ppvc_next; for (ppvc = &(vw->clips); *ppvc != NULL; ppvc = ppvc_next) { ppvc_next = &((*ppvc)->next); free(*ppvc, M_LINUX); } vw->clips = NULL; return (0); } static int linux_v4l_cliplist_copy(struct l_video_window *lvw, struct video_window *vw) { int error; int clipcount; void *plvc; struct video_clip **ppvc; /* * XXX: The cliplist is used to pass in a list of clipping * rectangles or, if clipcount == VIDEO_CLIP_BITMAP, a * clipping bitmap. Some Linux apps, however, appear to * leave cliplist and clips uninitialized. In any case, * the cliplist is not used by pwc(4), at the time of * writing, FreeBSD's only V4L driver. When a driver * that uses the cliplist is developed, this code may * need re-examiniation. */ error = 0; clipcount = vw->clipcount; if (clipcount == VIDEO_CLIP_BITMAP) { /* * In this case, the pointer (clips) is overloaded * to be a "void *" to a bitmap, therefore there * is no struct video_clip to copy now. */ } else if (clipcount > 0 && clipcount <= 16384) { /* * Clips points to list of clip rectangles, so * copy the list. * * XXX: Upper limit of 16384 was used here to try to * avoid cases when clipcount and clips pointer * are uninitialized and therefore have high random * values, as is the case in the Linux Skype * application. The value 16384 was chosen as that * is what is used in the Linux stradis(4) MPEG * decoder driver, the only place we found an * example of cliplist use. */ plvc = PTRIN(lvw->clips); vw->clips = NULL; ppvc = &(vw->clips); while (clipcount-- > 0) { if (plvc == NULL) { error = EFAULT; break; } else { error = linux_v4l_clip_copy(plvc, ppvc); if (error) { linux_v4l_cliplist_free(vw); break; } } ppvc = &((*ppvc)->next); plvc = PTRIN(((struct l_video_clip *) plvc)->next); } } else { /* * clipcount == 0 or negative (but not VIDEO_CLIP_BITMAP) * Force cliplist to null. */ vw->clipcount = 0; vw->clips = NULL; } return (error); } #endif static int linux_ioctl_v4l(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error; struct video_tuner vtun; struct video_window vwin; struct video_buffer vbuf; struct video_code vcode; struct l_video_tuner l_vtun; struct l_video_window l_vwin; struct l_video_buffer l_vbuf; struct l_video_code l_vcode; switch (args->cmd & 0xffff) { case LINUX_VIDIOCGCAP: args->cmd = VIDIOCGCAP; break; case LINUX_VIDIOCGCHAN: args->cmd = VIDIOCGCHAN; break; case LINUX_VIDIOCSCHAN: args->cmd = VIDIOCSCHAN; break; case LINUX_VIDIOCGTUNER: error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_tuner(&l_vtun, &vtun); error = fo_ioctl(fp, VIDIOCGTUNER, &vtun, td->td_ucred, td); if (!error) { bsd_to_linux_v4l_tuner(&vtun, &l_vtun); error = copyout(&l_vtun, (void *) args->arg, sizeof(l_vtun)); } fdrop(fp, td); return (error); case LINUX_VIDIOCSTUNER: error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_tuner(&l_vtun, &vtun); error = fo_ioctl(fp, VIDIOCSTUNER, &vtun, td->td_ucred, td); fdrop(fp, td); return (error); case LINUX_VIDIOCGPICT: args->cmd = VIDIOCGPICT; break; case LINUX_VIDIOCSPICT: args->cmd = VIDIOCSPICT; break; case LINUX_VIDIOCCAPTURE: args->cmd = VIDIOCCAPTURE; break; case LINUX_VIDIOCGWIN: error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, VIDIOCGWIN, &vwin, td->td_ucred, td); if (!error) { bsd_to_linux_v4l_window(&vwin, &l_vwin); error = copyout(&l_vwin, (void *) args->arg, sizeof(l_vwin)); } fdrop(fp, td); return (error); case LINUX_VIDIOCSWIN: error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vwin, sizeof(l_vwin)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_window(&l_vwin, &vwin); #ifdef COMPAT_LINUX_V4L_CLIPLIST error = linux_v4l_cliplist_copy(&l_vwin, &vwin); if (error) { fdrop(fp, td); return (error); } #endif error = fo_ioctl(fp, VIDIOCSWIN, &vwin, td->td_ucred, td); fdrop(fp, td); #ifdef COMPAT_LINUX_V4L_CLIPLIST linux_v4l_cliplist_free(&vwin); #endif return (error); case LINUX_VIDIOCGFBUF: error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, VIDIOCGFBUF, &vbuf, td->td_ucred, td); if (!error) { bsd_to_linux_v4l_buffer(&vbuf, &l_vbuf); error = copyout(&l_vbuf, (void *) args->arg, sizeof(l_vbuf)); } fdrop(fp, td); return (error); case LINUX_VIDIOCSFBUF: error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vbuf, sizeof(l_vbuf)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_buffer(&l_vbuf, &vbuf); error = fo_ioctl(fp, VIDIOCSFBUF, &vbuf, td->td_ucred, td); fdrop(fp, td); return (error); case LINUX_VIDIOCKEY: args->cmd = VIDIOCKEY; break; case LINUX_VIDIOCGFREQ: args->cmd = VIDIOCGFREQ; break; case LINUX_VIDIOCSFREQ: args->cmd = VIDIOCSFREQ; break; case LINUX_VIDIOCGAUDIO: args->cmd = VIDIOCGAUDIO; break; case LINUX_VIDIOCSAUDIO: args->cmd = VIDIOCSAUDIO; break; case LINUX_VIDIOCSYNC: args->cmd = VIDIOCSYNC; break; case LINUX_VIDIOCMCAPTURE: args->cmd = VIDIOCMCAPTURE; break; case LINUX_VIDIOCGMBUF: args->cmd = VIDIOCGMBUF; break; case LINUX_VIDIOCGUNIT: args->cmd = VIDIOCGUNIT; break; case LINUX_VIDIOCGCAPTURE: args->cmd = VIDIOCGCAPTURE; break; case LINUX_VIDIOCSCAPTURE: args->cmd = VIDIOCSCAPTURE; break; case LINUX_VIDIOCSPLAYMODE: args->cmd = VIDIOCSPLAYMODE; break; case LINUX_VIDIOCSWRITEMODE: args->cmd = VIDIOCSWRITEMODE; break; case LINUX_VIDIOCGPLAYINFO: args->cmd = VIDIOCGPLAYINFO; break; case LINUX_VIDIOCSMICROCODE: error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vcode, sizeof(l_vcode)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_code(&l_vcode, &vcode); error = fo_ioctl(fp, VIDIOCSMICROCODE, &vcode, td->td_ucred, td); fdrop(fp, td); return (error); case LINUX_VIDIOCGVBIFMT: args->cmd = VIDIOCGVBIFMT; break; case LINUX_VIDIOCSVBIFMT: args->cmd = VIDIOCSVBIFMT; break; default: return (ENOIOCTL); } error = sys_ioctl(td, (struct ioctl_args *)args); return (error); } /* * Special ioctl handler */ static int linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args) { int error; switch (args->cmd) { case LINUX_SIOCGIFADDR: args->cmd = SIOCGIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFADDR: args->cmd = SIOCSIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = sys_ioctl(td, (struct ioctl_args *)args); break; default: error = ENOIOCTL; } return (error); } static int linux_to_bsd_v4l2_standard(struct l_v4l2_standard *lvstd, struct v4l2_standard *vstd) { vstd->index = lvstd->index; vstd->id = lvstd->id; CTASSERT(sizeof(vstd->name) == sizeof(lvstd->name)); memcpy(vstd->name, lvstd->name, sizeof(vstd->name)); vstd->frameperiod = lvstd->frameperiod; vstd->framelines = lvstd->framelines; CTASSERT(sizeof(vstd->reserved) == sizeof(lvstd->reserved)); memcpy(vstd->reserved, lvstd->reserved, sizeof(vstd->reserved)); return (0); } static int bsd_to_linux_v4l2_standard(struct v4l2_standard *vstd, struct l_v4l2_standard *lvstd) { lvstd->index = vstd->index; lvstd->id = vstd->id; CTASSERT(sizeof(vstd->name) == sizeof(lvstd->name)); memcpy(lvstd->name, vstd->name, sizeof(lvstd->name)); lvstd->frameperiod = vstd->frameperiod; lvstd->framelines = vstd->framelines; CTASSERT(sizeof(vstd->reserved) == sizeof(lvstd->reserved)); memcpy(lvstd->reserved, vstd->reserved, sizeof(lvstd->reserved)); return (0); } static int linux_to_bsd_v4l2_buffer(struct l_v4l2_buffer *lvb, struct v4l2_buffer *vb) { vb->index = lvb->index; vb->type = lvb->type; vb->bytesused = lvb->bytesused; vb->flags = lvb->flags; vb->field = lvb->field; vb->timestamp.tv_sec = lvb->timestamp.tv_sec; vb->timestamp.tv_usec = lvb->timestamp.tv_usec; memcpy(&vb->timecode, &lvb->timecode, sizeof (lvb->timecode)); vb->sequence = lvb->sequence; vb->memory = lvb->memory; if (lvb->memory == V4L2_MEMORY_USERPTR) /* possible pointer size conversion */ vb->m.userptr = (unsigned long)PTRIN(lvb->m.userptr); else vb->m.offset = lvb->m.offset; vb->length = lvb->length; vb->input = lvb->input; vb->reserved = lvb->reserved; return (0); } static int bsd_to_linux_v4l2_buffer(struct v4l2_buffer *vb, struct l_v4l2_buffer *lvb) { lvb->index = vb->index; lvb->type = vb->type; lvb->bytesused = vb->bytesused; lvb->flags = vb->flags; lvb->field = vb->field; lvb->timestamp.tv_sec = vb->timestamp.tv_sec; lvb->timestamp.tv_usec = vb->timestamp.tv_usec; memcpy(&lvb->timecode, &vb->timecode, sizeof (vb->timecode)); lvb->sequence = vb->sequence; lvb->memory = vb->memory; if (vb->memory == V4L2_MEMORY_USERPTR) /* possible pointer size conversion */ lvb->m.userptr = PTROUT(vb->m.userptr); else lvb->m.offset = vb->m.offset; lvb->length = vb->length; lvb->input = vb->input; lvb->reserved = vb->reserved; return (0); } static int linux_to_bsd_v4l2_format(struct l_v4l2_format *lvf, struct v4l2_format *vf) { vf->type = lvf->type; if (lvf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY #ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY || lvf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY #endif ) /* * XXX TODO - needs 32 -> 64 bit conversion: * (unused by webcams?) */ return (EINVAL); memcpy(&vf->fmt, &lvf->fmt, sizeof(vf->fmt)); return (0); } static int bsd_to_linux_v4l2_format(struct v4l2_format *vf, struct l_v4l2_format *lvf) { lvf->type = vf->type; if (vf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY #ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY || vf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY #endif ) /* * XXX TODO - needs 32 -> 64 bit conversion: * (unused by webcams?) */ return (EINVAL); memcpy(&lvf->fmt, &vf->fmt, sizeof(vf->fmt)); return (0); } static int linux_ioctl_v4l2(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error; struct v4l2_format vformat; struct l_v4l2_format l_vformat; struct v4l2_standard vstd; struct l_v4l2_standard l_vstd; struct l_v4l2_buffer l_vbuf; struct v4l2_buffer vbuf; struct v4l2_input vinp; switch (args->cmd & 0xffff) { case LINUX_VIDIOC_RESERVED: case LINUX_VIDIOC_LOG_STATUS: if ((args->cmd & IOC_DIRMASK) != LINUX_IOC_VOID) return (ENOIOCTL); args->cmd = (args->cmd & 0xffff) | IOC_VOID; break; case LINUX_VIDIOC_OVERLAY: case LINUX_VIDIOC_STREAMON: case LINUX_VIDIOC_STREAMOFF: case LINUX_VIDIOC_S_STD: case LINUX_VIDIOC_S_TUNER: case LINUX_VIDIOC_S_AUDIO: case LINUX_VIDIOC_S_AUDOUT: case LINUX_VIDIOC_S_MODULATOR: case LINUX_VIDIOC_S_FREQUENCY: case LINUX_VIDIOC_S_CROP: case LINUX_VIDIOC_S_JPEGCOMP: case LINUX_VIDIOC_S_PRIORITY: case LINUX_VIDIOC_DBG_S_REGISTER: case LINUX_VIDIOC_S_HW_FREQ_SEEK: case LINUX_VIDIOC_SUBSCRIBE_EVENT: case LINUX_VIDIOC_UNSUBSCRIBE_EVENT: args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_IN; break; case LINUX_VIDIOC_QUERYCAP: case LINUX_VIDIOC_G_STD: case LINUX_VIDIOC_G_AUDIO: case LINUX_VIDIOC_G_INPUT: case LINUX_VIDIOC_G_OUTPUT: case LINUX_VIDIOC_G_AUDOUT: case LINUX_VIDIOC_G_JPEGCOMP: case LINUX_VIDIOC_QUERYSTD: case LINUX_VIDIOC_G_PRIORITY: case LINUX_VIDIOC_QUERY_DV_PRESET: args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_OUT; break; case LINUX_VIDIOC_ENUM_FMT: case LINUX_VIDIOC_REQBUFS: case LINUX_VIDIOC_G_PARM: case LINUX_VIDIOC_S_PARM: case LINUX_VIDIOC_G_CTRL: case LINUX_VIDIOC_S_CTRL: case LINUX_VIDIOC_G_TUNER: case LINUX_VIDIOC_QUERYCTRL: case LINUX_VIDIOC_QUERYMENU: case LINUX_VIDIOC_S_INPUT: case LINUX_VIDIOC_S_OUTPUT: case LINUX_VIDIOC_ENUMOUTPUT: case LINUX_VIDIOC_G_MODULATOR: case LINUX_VIDIOC_G_FREQUENCY: case LINUX_VIDIOC_CROPCAP: case LINUX_VIDIOC_G_CROP: case LINUX_VIDIOC_ENUMAUDIO: case LINUX_VIDIOC_ENUMAUDOUT: case LINUX_VIDIOC_G_SLICED_VBI_CAP: #ifdef VIDIOC_ENUM_FRAMESIZES case LINUX_VIDIOC_ENUM_FRAMESIZES: case LINUX_VIDIOC_ENUM_FRAMEINTERVALS: case LINUX_VIDIOC_ENCODER_CMD: case LINUX_VIDIOC_TRY_ENCODER_CMD: #endif case LINUX_VIDIOC_DBG_G_REGISTER: case LINUX_VIDIOC_DBG_G_CHIP_IDENT: case LINUX_VIDIOC_ENUM_DV_PRESETS: case LINUX_VIDIOC_S_DV_PRESET: case LINUX_VIDIOC_G_DV_PRESET: case LINUX_VIDIOC_S_DV_TIMINGS: case LINUX_VIDIOC_G_DV_TIMINGS: args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT; break; case LINUX_VIDIOC_G_FMT: case LINUX_VIDIOC_S_FMT: case LINUX_VIDIOC_TRY_FMT: error = copyin((void *)args->arg, &l_vformat, sizeof(l_vformat)); if (error) return (error); error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error) return (error); if (linux_to_bsd_v4l2_format(&l_vformat, &vformat) != 0) error = EINVAL; else if ((args->cmd & 0xffff) == LINUX_VIDIOC_G_FMT) error = fo_ioctl(fp, VIDIOC_G_FMT, &vformat, td->td_ucred, td); else if ((args->cmd & 0xffff) == LINUX_VIDIOC_S_FMT) error = fo_ioctl(fp, VIDIOC_S_FMT, &vformat, td->td_ucred, td); else error = fo_ioctl(fp, VIDIOC_TRY_FMT, &vformat, td->td_ucred, td); bsd_to_linux_v4l2_format(&vformat, &l_vformat); copyout(&l_vformat, (void *)args->arg, sizeof(l_vformat)); fdrop(fp, td); return (error); case LINUX_VIDIOC_ENUMSTD: error = copyin((void *)args->arg, &l_vstd, sizeof(l_vstd)); if (error) return (error); linux_to_bsd_v4l2_standard(&l_vstd, &vstd); error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error) return (error); error = fo_ioctl(fp, VIDIOC_ENUMSTD, (caddr_t)&vstd, td->td_ucred, td); if (error) { fdrop(fp, td); return (error); } bsd_to_linux_v4l2_standard(&vstd, &l_vstd); error = copyout(&l_vstd, (void *)args->arg, sizeof(l_vstd)); fdrop(fp, td); return (error); case LINUX_VIDIOC_ENUMINPUT: /* * The Linux struct l_v4l2_input differs only in size, * it has no padding at the end. */ error = copyin((void *)args->arg, &vinp, sizeof(struct l_v4l2_input)); if (error != 0) return (error); error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, VIDIOC_ENUMINPUT, (caddr_t)&vinp, td->td_ucred, td); if (error) { fdrop(fp, td); return (error); } error = copyout(&vinp, (void *)args->arg, sizeof(struct l_v4l2_input)); fdrop(fp, td); return (error); case LINUX_VIDIOC_QUERYBUF: case LINUX_VIDIOC_QBUF: case LINUX_VIDIOC_DQBUF: error = copyin((void *)args->arg, &l_vbuf, sizeof(l_vbuf)); if (error) return (error); error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error) return (error); linux_to_bsd_v4l2_buffer(&l_vbuf, &vbuf); if ((args->cmd & 0xffff) == LINUX_VIDIOC_QUERYBUF) error = fo_ioctl(fp, VIDIOC_QUERYBUF, &vbuf, td->td_ucred, td); else if ((args->cmd & 0xffff) == LINUX_VIDIOC_QBUF) error = fo_ioctl(fp, VIDIOC_QBUF, &vbuf, td->td_ucred, td); else error = fo_ioctl(fp, VIDIOC_DQBUF, &vbuf, td->td_ucred, td); bsd_to_linux_v4l2_buffer(&vbuf, &l_vbuf); copyout(&l_vbuf, (void *)args->arg, sizeof(l_vbuf)); fdrop(fp, td); return (error); /* * XXX TODO - these need 32 -> 64 bit conversion: * (are any of them needed for webcams?) */ case LINUX_VIDIOC_G_FBUF: case LINUX_VIDIOC_S_FBUF: case LINUX_VIDIOC_G_EXT_CTRLS: case LINUX_VIDIOC_S_EXT_CTRLS: case LINUX_VIDIOC_TRY_EXT_CTRLS: case LINUX_VIDIOC_DQEVENT: default: return (ENOIOCTL); } error = sys_ioctl(td, (struct ioctl_args *)args); return (error); } /* * Support for emulators/linux-libusb. This port uses FBSD_LUSB* macros * instead of USB* ones. This lets us to provide correct values for cmd. * 0xffffffe0 -- 0xffffffff range seemed to be the least collision-prone. */ static int linux_ioctl_fbsd_usb(struct thread *td, struct linux_ioctl_args *args) { int error; error = 0; switch (args->cmd) { case FBSD_LUSB_DEVICEENUMERATE: args->cmd = USB_DEVICEENUMERATE; break; case FBSD_LUSB_DEV_QUIRK_ADD: args->cmd = USB_DEV_QUIRK_ADD; break; case FBSD_LUSB_DEV_QUIRK_GET: args->cmd = USB_DEV_QUIRK_GET; break; case FBSD_LUSB_DEV_QUIRK_REMOVE: args->cmd = USB_DEV_QUIRK_REMOVE; break; case FBSD_LUSB_DO_REQUEST: args->cmd = USB_DO_REQUEST; break; case FBSD_LUSB_FS_CLEAR_STALL_SYNC: args->cmd = USB_FS_CLEAR_STALL_SYNC; break; case FBSD_LUSB_FS_CLOSE: args->cmd = USB_FS_CLOSE; break; case FBSD_LUSB_FS_COMPLETE: args->cmd = USB_FS_COMPLETE; break; case FBSD_LUSB_FS_INIT: args->cmd = USB_FS_INIT; break; case FBSD_LUSB_FS_OPEN: args->cmd = USB_FS_OPEN; break; case FBSD_LUSB_FS_START: args->cmd = USB_FS_START; break; case FBSD_LUSB_FS_STOP: args->cmd = USB_FS_STOP; break; case FBSD_LUSB_FS_UNINIT: args->cmd = USB_FS_UNINIT; break; case FBSD_LUSB_GET_CONFIG: args->cmd = USB_GET_CONFIG; break; case FBSD_LUSB_GET_DEVICEINFO: args->cmd = USB_GET_DEVICEINFO; break; case FBSD_LUSB_GET_DEVICE_DESC: args->cmd = USB_GET_DEVICE_DESC; break; case FBSD_LUSB_GET_FULL_DESC: args->cmd = USB_GET_FULL_DESC; break; case FBSD_LUSB_GET_IFACE_DRIVER: args->cmd = USB_GET_IFACE_DRIVER; break; case FBSD_LUSB_GET_PLUGTIME: args->cmd = USB_GET_PLUGTIME; break; case FBSD_LUSB_GET_POWER_MODE: args->cmd = USB_GET_POWER_MODE; break; case FBSD_LUSB_GET_REPORT_DESC: args->cmd = USB_GET_REPORT_DESC; break; case FBSD_LUSB_GET_REPORT_ID: args->cmd = USB_GET_REPORT_ID; break; case FBSD_LUSB_GET_TEMPLATE: args->cmd = USB_GET_TEMPLATE; break; case FBSD_LUSB_IFACE_DRIVER_ACTIVE: args->cmd = USB_IFACE_DRIVER_ACTIVE; break; case FBSD_LUSB_IFACE_DRIVER_DETACH: args->cmd = USB_IFACE_DRIVER_DETACH; break; case FBSD_LUSB_QUIRK_NAME_GET: args->cmd = USB_QUIRK_NAME_GET; break; case FBSD_LUSB_READ_DIR: args->cmd = USB_READ_DIR; break; case FBSD_LUSB_SET_ALTINTERFACE: args->cmd = USB_SET_ALTINTERFACE; break; case FBSD_LUSB_SET_CONFIG: args->cmd = USB_SET_CONFIG; break; case FBSD_LUSB_SET_IMMED: args->cmd = USB_SET_IMMED; break; case FBSD_LUSB_SET_POWER_MODE: args->cmd = USB_SET_POWER_MODE; break; case FBSD_LUSB_SET_TEMPLATE: args->cmd = USB_SET_TEMPLATE; break; case FBSD_LUSB_FS_OPEN_STREAM: args->cmd = USB_FS_OPEN_STREAM; break; case FBSD_LUSB_GET_DEV_PORT_PATH: args->cmd = USB_GET_DEV_PORT_PATH; break; case FBSD_LUSB_GET_POWER_USAGE: args->cmd = USB_GET_POWER_USAGE; break; default: error = ENOIOCTL; } if (error != ENOIOCTL) error = sys_ioctl(td, (struct ioctl_args *)args); return (error); } /* * Some evdev ioctls must be translated. * - EVIOCGMTSLOTS is a IOC_READ ioctl on Linux although it has input data * (must be IOC_INOUT on FreeBSD). * - On Linux, EVIOCGRAB, EVIOCREVOKE and EVIOCRMFF are defined as _IOW with * an int argument. You don't pass an int pointer to the ioctl(), however, * but just the int directly. On FreeBSD, they are defined as _IOWINT for * this to work. */ static int linux_ioctl_evdev(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; clockid_t clock; int error; args->cmd = SETDIR(args->cmd); switch (args->cmd) { case (EVIOCGRAB & ~IOC_DIRMASK) | IOC_IN: args->cmd = EVIOCGRAB; break; case (EVIOCREVOKE & ~IOC_DIRMASK) | IOC_IN: args->cmd = EVIOCREVOKE; break; case (EVIOCRMFF & ~IOC_DIRMASK) | IOC_IN: args->cmd = EVIOCRMFF; break; case EVIOCSCLOCKID: { error = copyin(PTRIN(args->arg), &clock, sizeof(clock)); if (error != 0) return (error); if (clock & ~(LINUX_IOCTL_EVDEV_CLK)) return (EINVAL); error = linux_to_native_clockid(&clock, clock); if (error != 0) return (error); error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, EVIOCSCLOCKID, &clock, td->td_ucred, td); fdrop(fp, td); return (error); } default: break; } if (IOCBASECMD(args->cmd) == ((EVIOCGMTSLOTS(0) & ~IOC_DIRMASK) | IOC_OUT)) args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT; return (sys_ioctl(td, (struct ioctl_args *)args)); } /* * main ioctl syscall function */ int linux_ioctl(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; struct handler_element *he; int error, cmd; #ifdef DEBUG if (ldebug(ioctl)) printf(ARGS(ioctl, "%d, %04lx, *"), args->fd, (unsigned long)args->cmd); #endif error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & (FREAD|FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } /* Iterate over the ioctl handlers */ cmd = args->cmd & 0xffff; sx_slock(&linux_ioctl_sx); mtx_lock(&Giant); TAILQ_FOREACH(he, &handlers, list) { if (cmd >= he->low && cmd <= he->high) { error = (*he->func)(td, args); if (error != ENOIOCTL) { mtx_unlock(&Giant); sx_sunlock(&linux_ioctl_sx); fdrop(fp, td); return (error); } } } mtx_unlock(&Giant); sx_sunlock(&linux_ioctl_sx); fdrop(fp, td); switch (args->cmd & 0xffff) { case LINUX_BTRFS_IOC_CLONE: return (ENOTSUP); default: linux_msg(td, "ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented", args->fd, (int)(args->cmd & 0xffff), (int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff)); break; } return (EINVAL); } int linux_ioctl_register_handler(struct linux_ioctl_handler *h) { struct handler_element *he, *cur; if (h == NULL || h->func == NULL) return (EINVAL); /* * Reuse the element if the handler is already on the list, otherwise * create a new element. */ sx_xlock(&linux_ioctl_sx); TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) break; } if (he == NULL) { he = malloc(sizeof(*he), M_LINUX, M_WAITOK); he->func = h->func; } else TAILQ_REMOVE(&handlers, he, list); /* Initialize range information. */ he->low = h->low; he->high = h->high; he->span = h->high - h->low + 1; /* Add the element to the list, sorted on span. */ TAILQ_FOREACH(cur, &handlers, list) { if (cur->span > he->span) { TAILQ_INSERT_BEFORE(cur, he, list); sx_xunlock(&linux_ioctl_sx); return (0); } } TAILQ_INSERT_TAIL(&handlers, he, list); sx_xunlock(&linux_ioctl_sx); return (0); } int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h) { struct handler_element *he; if (h == NULL || h->func == NULL) return (EINVAL); sx_xlock(&linux_ioctl_sx); TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) { TAILQ_REMOVE(&handlers, he, list); sx_xunlock(&linux_ioctl_sx); free(he, M_LINUX); return (0); } } sx_xunlock(&linux_ioctl_sx); return (EINVAL); } Index: head/sys/compat/linuxkpi/common/include/linux/inetdevice.h =================================================================== --- head/sys/compat/linuxkpi/common/include/linux/inetdevice.h (revision 334117) +++ head/sys/compat/linuxkpi/common/include/linux/inetdevice.h (revision 334118) @@ -1,94 +1,96 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_INETDEVICE_H_ #define _LINUX_INETDEVICE_H_ #include static inline struct net_device * ip_dev_find(struct vnet *vnet, uint32_t addr) { struct sockaddr_in sin; struct ifaddr *ifa; struct ifnet *ifp; memset(&sin, 0, sizeof(sin)); sin.sin_addr.s_addr = addr; sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; + NET_EPOCH_ENTER(); CURVNET_SET_QUIET(vnet); ifa = ifa_ifwithaddr((struct sockaddr *)&sin); CURVNET_RESTORE(); if (ifa) { ifp = ifa->ifa_ifp; if_ref(ifp); - ifa_free(ifa); } else { ifp = NULL; } + NET_EPOCH_EXIT(); return (ifp); } static inline struct net_device * ip6_dev_find(struct vnet *vnet, struct in6_addr addr) { struct sockaddr_in6 sin6; struct ifaddr *ifa = NULL; struct ifnet *ifp = NULL; int x; memset(&sin6, 0, sizeof(sin6)); sin6.sin6_addr = addr; sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; + NET_EPOCH_ENTER(); CURVNET_SET_QUIET(vnet); if (IN6_IS_SCOPE_LINKLOCAL(&addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&addr)) { /* XXX need to search all scope ID's */ for (x = 0; x <= V_if_index && x < 65536; x++) { sin6.sin6_addr.s6_addr16[1] = htons(x); ifa = ifa_ifwithaddr((struct sockaddr *)&sin6); if (ifa != NULL) break; } } else { ifa = ifa_ifwithaddr((struct sockaddr *)&sin6); } if (ifa != NULL) { ifp = ifa->ifa_ifp; if_ref(ifp); - ifa_free(ifa); } + NET_EPOCH_EXIT(); CURVNET_RESTORE(); return (ifp); } #endif /* _LINUX_INETDEVICE_H_ */ Index: head/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c =================================================================== --- head/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision 334117) +++ head/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision 334118) @@ -1,3246 +1,3246 @@ /*- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #if defined(CONFIG_X86) #include #endif #include #include #include #undef inode #include #include #include #include #include #include #include #include #include #include #include #include "mlx5_ib.h" #define DRIVER_NAME "mlx5_ib" #define DRIVER_VERSION "3.4.1-BETA" #define DRIVER_RELDATE "October 2017" MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1); MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1); MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1); MODULE_VERSION(mlx5ib, 1); static int deprecated_prof_sel = 2; module_param_named(prof_sel, deprecated_prof_sel, int, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; enum { MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, }; static enum rdma_link_layer mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { switch (port_type_cap) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; } } static enum rdma_link_layer mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx5_ib_dev *dev = to_mdev(device); int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); return mlx5_port_type_cap_to_rdma_ll(port_type_cap); } static bool mlx5_netdev_match(struct net_device *ndev, struct mlx5_core_dev *mdev, const char *dname) { return ndev->if_type == IFT_ETHER && ndev->if_dname != NULL && strcmp(ndev->if_dname, dname) == 0 && ndev->if_softc != NULL && *(struct mlx5_core_dev **)ndev->if_softc == mdev; } static int mlx5_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev, roce.nb); switch (event) { case NETDEV_REGISTER: case NETDEV_UNREGISTER: write_lock(&ibdev->roce.netdev_lock); /* check if network interface belongs to mlx5en */ if (mlx5_netdev_match(ndev, ibdev->mdev, "mce")) ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev; write_unlock(&ibdev->roce.netdev_lock); break; case NETDEV_UP: case NETDEV_DOWN: { struct net_device *upper = NULL; if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev)) && ibdev->ib_active) { struct ib_event ibev = {0}; ibev.device = &ibdev->ib_dev; ibev.event = (event == NETDEV_UP) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; ibev.element.port_num = 1; ib_dispatch_event(&ibev); } break; } default: break; } return NOTIFY_DONE; } static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, u8 port_num) { struct mlx5_ib_dev *ibdev = to_mdev(device); struct net_device *ndev; /* Ensure ndev does not disappear before we invoke dev_hold() */ read_lock(&ibdev->roce.netdev_lock); ndev = ibdev->roce.netdev; if (ndev) dev_hold(ndev); read_unlock(&ibdev->roce.netdev_lock); return ndev; } static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, u8 *active_width) { switch (eth_proto_oper) { case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII): case MLX5E_PROT_MASK(MLX5E_1000BASE_KX): case MLX5E_PROT_MASK(MLX5E_100BASE_TX): case MLX5E_PROT_MASK(MLX5E_1000BASE_T): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_SDR; break; case MLX5E_PROT_MASK(MLX5E_10GBASE_T): case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4): case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4): case MLX5E_PROT_MASK(MLX5E_10GBASE_KR): case MLX5E_PROT_MASK(MLX5E_10GBASE_CR): case MLX5E_PROT_MASK(MLX5E_10GBASE_SR): case MLX5E_PROT_MASK(MLX5E_10GBASE_ER): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_25GBASE_CR): case MLX5E_PROT_MASK(MLX5E_25GBASE_KR): case MLX5E_PROT_MASK(MLX5E_25GBASE_SR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2): case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_56GBASE_R4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_FDR; break; case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_EDR; break; default: return -EINVAL; } return 0; } static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(device); struct net_device *ndev; enum ib_mtu ndev_ib_mtu; u16 qkey_viol_cntr; u32 eth_prot_oper; int err; memset(props, 0, sizeof(*props)); /* Possible bad flows are checked before filling out props so in case * of an error it will still be zeroed out. */ err = mlx5_query_port_eth_proto_oper(dev->mdev, ð_prot_oper, port_num); if (err) return err; translate_eth_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); props->port_cap_flags |= IB_PORT_CM_SUP; props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, roce_address_table_size); props->max_mtu = IB_MTU_4096; props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); props->pkey_tbl_len = 1; props->state = IB_PORT_DOWN; props->phys_state = 3; mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); props->qkey_viol_cntr = qkey_viol_cntr; ndev = mlx5_ib_get_netdev(device, port_num); if (!ndev) return 0; if (netif_running(ndev) && netif_carrier_ok(ndev)) { props->state = IB_PORT_ACTIVE; props->phys_state = 5; } ndev_ib_mtu = iboe_get_mtu(ndev->if_mtu); dev_put(ndev); props->active_mtu = min(props->max_mtu, ndev_ib_mtu); return 0; } static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, const struct ib_gid_attr *attr, void *mlx5_addr) { #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, source_l3_address); void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, source_mac_47_32); if (!gid) return; ether_addr_copy(mlx5_addr_mac, IF_LLADDR(attr->ndev)); if (is_vlan_dev(attr->ndev)) { MLX5_SET_RA(mlx5_addr, vlan_valid, 1); MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev)); } switch (attr->gid_type) { case IB_GID_TYPE_IB: MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); break; case IB_GID_TYPE_ROCE_UDP_ENCAP: MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); break; default: WARN_ON(true); } if (attr->gid_type != IB_GID_TYPE_IB) { if (ipv6_addr_v4mapped((void *)gid)) MLX5_SET_RA(mlx5_addr, roce_l3_type, MLX5_ROCE_L3_TYPE_IPV4); else MLX5_SET_RA(mlx5_addr, roce_l3_type, MLX5_ROCE_L3_TYPE_IPV6); } if ((attr->gid_type == IB_GID_TYPE_IB) || !ipv6_addr_v4mapped((void *)gid)) memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); else memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); } static int set_roce_addr(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr) { struct mlx5_ib_dev *dev = to_mdev(device); u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0}; void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); if (ll != IB_LINK_LAYER_ETHERNET) return -EINVAL; ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); MLX5_SET(set_roce_address_in, in, roce_address_index, index); MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); } static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr, __always_unused void **context) { return set_roce_addr(device, port_num, index, gid, attr); } static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, unsigned int index, __always_unused void **context) { return set_roce_addr(device, port_num, index, NULL, NULL); } __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index) { struct ib_gid_attr attr; union ib_gid gid; if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) return 0; if (!attr.ndev) return 0; dev_put(attr.ndev); if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) return 0; return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); } static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) return !MLX5_CAP_GEN(dev->mdev, ib_virt); return 0; } enum { MLX5_VPORT_ACCESS_METHOD_MAD, MLX5_VPORT_ACCESS_METHOD_HCA, MLX5_VPORT_ACCESS_METHOD_NIC, }; static int mlx5_get_vport_access_method(struct ib_device *ibdev) { if (mlx5_use_mad_ifc(to_mdev(ibdev))) return MLX5_VPORT_ACCESS_METHOD_MAD; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) return MLX5_VPORT_ACCESS_METHOD_NIC; return MLX5_VPORT_ACCESS_METHOD_HCA; } static void get_atomic_caps(struct mlx5_ib_dev *dev, struct ib_device_attr *props) { u8 tmp; u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); u8 atomic_req_8B_endianness_mode = MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode); /* Check if HW supports 8 bytes standard atomic operations and capable * of host endianness respond */ tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; if (((atomic_operations & tmp) == tmp) && (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && (atomic_req_8B_endianness_mode)) { props->atomic_cap = IB_ATOMIC_HCA; } else { props->atomic_cap = IB_ATOMIC_NONE; } } static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u64 tmp; int err; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_system_image_guid(ibdev, sys_image_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); break; default: return -EINVAL; } if (!err) *sys_image_guid = cpu_to_be64(tmp); return err; } static int mlx5_query_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); return 0; default: return -EINVAL; } } static int mlx5_query_vendor_id(struct ib_device *ibdev, u32 *vendor_id) { struct mlx5_ib_dev *dev = to_mdev(ibdev); switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_core_query_vendor_id(dev->mdev, vendor_id); default: return -EINVAL; } } static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) { u64 tmp; int err; switch (mlx5_get_vport_access_method(&dev->ib_dev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_node_guid(dev, node_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); break; default: return -EINVAL; } if (!err) *node_guid = cpu_to_be64(tmp); return err; } struct mlx5_reg_node_desc { u8 desc[IB_DEVICE_NODE_DESC_MAX]; }; static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) { struct mlx5_reg_node_desc in; if (mlx5_use_mad_ifc(dev)) return mlx5_query_mad_ifc_node_desc(dev, node_desc); memset(&in, 0, sizeof(in)); return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, sizeof(struct mlx5_reg_node_desc), MLX5_REG_NODE_DESC, 0, 0); } static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; int err = -ENOMEM; int max_rq_sg; int max_sq_sg; u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); struct mlx5_ib_query_device_resp resp = {}; size_t resp_len; u64 max_tso; resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); if (uhw->outlen && uhw->outlen < resp_len) return -EINVAL; else resp.response_length = resp_len; if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) return -EINVAL; memset(props, 0, sizeof(*props)); err = mlx5_query_system_image_guid(ibdev, &props->sys_image_guid); if (err) return err; err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); if (err) return err; err = mlx5_query_vendor_id(ibdev, &props->vendor_id); if (err) return err; props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | (fw_rev_min(dev->mdev) << 16) | fw_rev_sub(dev->mdev); props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; if (MLX5_CAP_GEN(mdev, pkv)) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (MLX5_CAP_GEN(mdev, qkv)) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (MLX5_CAP_GEN(mdev, apm)) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; if (MLX5_CAP_GEN(mdev, imaicl)) { props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_WINDOW_TYPE_2B; props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); /* We support 'Gappy' memory registration too */ props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; /* At this stage no support for signature handover */ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_2 | IB_PROT_T10DIF_TYPE_3; props->sig_guard_cap = IB_GUARD_T10DIF_CRC | IB_GUARD_T10DIF_CSUM; } if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { if (MLX5_CAP_ETH(mdev, csum_cap)) props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); if (max_tso) { resp.tso_caps.max_tso = 1 << max_tso; resp.tso_caps.supported_qpts |= 1 << IB_QPT_RAW_PACKET; resp.response_length += sizeof(resp.tso_caps); } } if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { resp.rss_caps.rx_hash_function = MLX5_RX_HASH_FUNC_TOEPLITZ; resp.rss_caps.rx_hash_fields_mask = MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4 | MLX5_RX_HASH_SRC_IPV6 | MLX5_RX_HASH_DST_IPV6 | MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP | MLX5_RX_HASH_SRC_PORT_UDP | MLX5_RX_HASH_DST_PORT_UDP; resp.response_length += sizeof(resp.rss_caps); } } else { if (field_avail(typeof(resp), tso_caps, uhw->outlen)) resp.response_length += sizeof(resp.tso_caps); if (field_avail(typeof(resp), rss_caps, uhw->outlen)) resp.response_length += sizeof(resp.rss_caps); } if (MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) { props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; props->device_cap_flags |= IB_DEVICE_UD_TSO; } if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && MLX5_CAP_ETH(dev->mdev, scatter_fcs)) props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; props->max_mr_size = ~0ull; props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_sg = (MLX5_CAP_GEN(mdev, max_wqe_sz_sq) - sizeof(struct mlx5_wqe_ctrl_seg)) / sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); props->max_sge_rd = MLX5_MAX_SGE_RD; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (MLX5_CAP_GEN(mdev, pg)) props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; props->odp_caps = dev->odp_caps; #endif if (MLX5_CAP_GEN(mdev, cd)) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (!mlx5_core_is_pf(mdev)) props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { props->rss_caps.max_rwq_indirection_tables = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); props->rss_caps.max_rwq_indirection_table_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; props->max_wq_type_rq = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); } if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) return err; } return 0; } enum mlx5_ib_width { MLX5_IB_WIDTH_1X = 1 << 0, MLX5_IB_WIDTH_2X = 1 << 1, MLX5_IB_WIDTH_4X = 1 << 2, MLX5_IB_WIDTH_8X = 1 << 3, MLX5_IB_WIDTH_12X = 1 << 4 }; static int translate_active_width(struct ib_device *ibdev, u8 active_width, u8 *ib_width) { struct mlx5_ib_dev *dev = to_mdev(ibdev); int err = 0; if (active_width & MLX5_IB_WIDTH_1X) { *ib_width = IB_WIDTH_1X; } else if (active_width & MLX5_IB_WIDTH_2X) { mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n", (int)active_width); err = -EINVAL; } else if (active_width & MLX5_IB_WIDTH_4X) { *ib_width = IB_WIDTH_4X; } else if (active_width & MLX5_IB_WIDTH_8X) { *ib_width = IB_WIDTH_8X; } else if (active_width & MLX5_IB_WIDTH_12X) { *ib_width = IB_WIDTH_12X; } else { mlx5_ib_dbg(dev, "Invalid active_width %d\n", (int)active_width); err = -EINVAL; } return err; } enum ib_max_vl_num { __IB_MAX_VL_0 = 1, __IB_MAX_VL_0_1 = 2, __IB_MAX_VL_0_3 = 3, __IB_MAX_VL_0_7 = 4, __IB_MAX_VL_0_14 = 5, }; enum mlx5_vl_hw_cap { MLX5_VL_HW_0 = 1, MLX5_VL_HW_0_1 = 2, MLX5_VL_HW_0_2 = 3, MLX5_VL_HW_0_3 = 4, MLX5_VL_HW_0_4 = 5, MLX5_VL_HW_0_5 = 6, MLX5_VL_HW_0_6 = 7, MLX5_VL_HW_0_7 = 8, MLX5_VL_HW_0_14 = 15 }; static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, u8 *max_vl_num) { switch (vl_hw_cap) { case MLX5_VL_HW_0: *max_vl_num = __IB_MAX_VL_0; break; case MLX5_VL_HW_0_1: *max_vl_num = __IB_MAX_VL_0_1; break; case MLX5_VL_HW_0_3: *max_vl_num = __IB_MAX_VL_0_3; break; case MLX5_VL_HW_0_7: *max_vl_num = __IB_MAX_VL_0_7; break; case MLX5_VL_HW_0_14: *max_vl_num = __IB_MAX_VL_0_14; break; default: return -EINVAL; } return 0; } static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u32 *rep; int replen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); struct mlx5_ptys_reg *ptys; struct mlx5_pmtu_reg *pmtu; struct mlx5_pvlc_reg pvlc; void *ctx; int err; rep = mlx5_vzalloc(replen); ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL); if (!rep || !ptys || !pmtu) { err = -ENOMEM; goto out; } memset(props, 0, sizeof(*props)); err = mlx5_query_hca_vport_context(mdev, port, 0, rep, replen); if (err) goto out; ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context); props->lid = MLX5_GET(hca_vport_context, ctx, lid); props->lmc = MLX5_GET(hca_vport_context, ctx, lmc); props->sm_lid = MLX5_GET(hca_vport_context, ctx, sm_lid); props->sm_sl = MLX5_GET(hca_vport_context, ctx, sm_sl); props->state = MLX5_GET(hca_vport_context, ctx, vport_state); props->phys_state = MLX5_GET(hca_vport_context, ctx, port_physical_state); props->port_cap_flags = MLX5_GET(hca_vport_context, ctx, cap_mask1); props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); props->bad_pkey_cntr = MLX5_GET(hca_vport_context, ctx, pkey_violation_counter); props->qkey_viol_cntr = MLX5_GET(hca_vport_context, ctx, qkey_violation_counter); props->subnet_timeout = MLX5_GET(hca_vport_context, ctx, subnet_timeout); props->init_type_reply = MLX5_GET(hca_vport_context, ctx, init_type_reply); props->grh_required = MLX5_GET(hca_vport_context, ctx, grh_required); ptys->proto_mask |= MLX5_PTYS_IB; ptys->local_port = port; err = mlx5_core_access_ptys(mdev, ptys, 0); if (err) goto out; err = translate_active_width(ibdev, ptys->ib_link_width_oper, &props->active_width); if (err) goto out; props->active_speed = (u8)ptys->ib_proto_oper; pmtu->local_port = port; err = mlx5_core_access_pmtu(mdev, pmtu, 0); if (err) goto out; props->max_mtu = pmtu->max_mtu; props->active_mtu = pmtu->oper_mtu; memset(&pvlc, 0, sizeof(pvlc)); pvlc.local_port = port; err = mlx5_core_access_pvlc(mdev, &pvlc, 0); if (err) goto out; err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap, &props->max_vl_num); out: kvfree(rep); kfree(ptys); kfree(pmtu); return err; } int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_port_roce(ibdev, port, props); default: return -EINVAL; } } static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_gids(ibdev, port, index, gid); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid); default: return -EINVAL; } } static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, pkey); default: return -EINVAL; } } static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_reg_node_desc in; struct mlx5_reg_node_desc out; int err; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX); err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, sizeof(out), MLX5_REG_NODE_DESC, 0, 1); if (err) return err; memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); return err; } static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_port_attr attr; u32 tmp; int err; mutex_lock(&dev->cap_mask_mutex); err = mlx5_ib_query_port(ibdev, port, &attr); if (err) goto out; tmp = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx5_set_port_caps(dev->mdev, port, tmp); out: mutex_unlock(&dev->cap_mask_mutex); return err; } static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; int gross_uuars; int num_uars; int ver; int uuarn; int err; int i; size_t reqlen; size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, max_cqe_version); if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr)) return ERR_PTR(-EINVAL); reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; else if (reqlen >= min_req_v2) ver = 2; else return ERR_PTR(-EINVAL); err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req))); if (err) return ERR_PTR(err); if (req.flags) return ERR_PTR(-EINVAL); if (req.total_num_uuars > MLX5_MAX_UUARS) return ERR_PTR(-ENOMEM); if (req.total_num_uuars == 0) return ERR_PTR(-EINVAL); if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) return ERR_PTR(-EOPNOTSUPP); if (reqlen > sizeof(req) && !ib_is_udata_cleared(udata, sizeof(req), reqlen - sizeof(req))) return ERR_PTR(-EOPNOTSUPP); req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_NON_FP_BF_REGS_PER_PAGE); if (req.num_low_latency_uuars > req.total_num_uuars - 1) return ERR_PTR(-EINVAL); num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp.cache_line_size = cache_line_size(); resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); resp.cqe_version = min_t(__u8, (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), req.max_cqe_version); resp.response_length = min(offsetof(typeof(resp), response_length) + sizeof(resp.response_length), udata->outlen); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); uuari = &context->uuari; mutex_init(&uuari->lock); uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); if (!uars) { err = -ENOMEM; goto out_ctx; } uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), sizeof(*uuari->bitmap), GFP_KERNEL); if (!uuari->bitmap) { err = -ENOMEM; goto out_uar_ctx; } /* * clear all fast path uuars */ for (i = 0; i < gross_uuars; i++) { uuarn = i & 3; if (uuarn == 2 || uuarn == 3) set_bit(i, uuari->bitmap); } uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); if (!uuari->count) { err = -ENOMEM; goto out_bitmap; } for (i = 0; i < num_uars; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); if (err) goto out_count; } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn); if (err) goto out_uars; } INIT_LIST_HEAD(&context->vma_private_list); INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); if (field_avail(typeof(resp), cqe_version, udata->outlen)) resp.response_length += sizeof(resp.cqe_version); if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; resp.response_length += sizeof(resp.cmds_supp_uhw); } /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's * pretend we don't support reading the HCA's core clock. This is also * forced by mmap function. */ if (PAGE_SIZE <= 4096 && field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; resp.hca_core_clock_offset = offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; resp.response_length += sizeof(resp.hca_core_clock_offset) + sizeof(resp.reserved2); } err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_td; uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; context->cqe_version = resp.cqe_version; return &context->ibucontext; out_td: if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_dealloc_transport_domain(dev->mdev, context->tdn); out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); out_count: kfree(uuari->count); out_bitmap: kfree(uuari->bitmap); out_uar_ctx: kfree(uars); out_ctx: kfree(context); return ERR_PTR(err); } static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; int i; if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_dealloc_transport_domain(dev->mdev, context->tdn); for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); } kfree(uuari->count); kfree(uuari->bitmap); kfree(uuari->uars); kfree(context); return 0; } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) { return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; } static int get_command(unsigned long offset) { return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; } static int get_arg(unsigned long offset) { return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); } static int get_index(unsigned long offset) { return get_arg(offset); } static void mlx5_ib_vma_open(struct vm_area_struct *area) { /* vma_open is called when a new VMA is created on top of our VMA. This * is done through either mremap flow or split_vma (usually due to * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, * as this VMA is strongly hardware related. Therefore we set the * vm_ops of the newly created/cloned VMA to NULL, to prevent it from * calling us again and trying to do incorrect actions. We assume that * the original VMA size is exactly a single page, and therefore all * "splitting" operation will not happen to it. */ area->vm_ops = NULL; } static void mlx5_ib_vma_close(struct vm_area_struct *area) { struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; /* It's guaranteed that all VMAs opened on a FD are closed before the * file itself is closed, therefore no sync is needed with the regular * closing flow. (e.g. mlx5 ib_dealloc_ucontext) * However need a sync with accessing the vma as part of * mlx5_ib_disassociate_ucontext. * The close operation is usually called under mm->mmap_sem except when * process is exiting. * The exiting case is handled explicitly as part of * mlx5_ib_disassociate_ucontext. */ mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data; /* setting the vma context pointer to null in the mlx5_ib driver's * private data, to protect a race condition in * mlx5_ib_disassociate_ucontext(). */ mlx5_ib_vma_priv_data->vma = NULL; list_del(&mlx5_ib_vma_priv_data->list); kfree(mlx5_ib_vma_priv_data); } static const struct vm_operations_struct mlx5_ib_vm_ops = { .open = mlx5_ib_vma_open, .close = mlx5_ib_vma_close }; static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, struct mlx5_ib_ucontext *ctx) { struct mlx5_ib_vma_private_data *vma_prv; struct list_head *vma_head = &ctx->vma_private_list; vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL); if (!vma_prv) return -ENOMEM; vma_prv->vma = vma; vma->vm_private_data = vma_prv; vma->vm_ops = &mlx5_ib_vm_ops; list_add(&vma_prv->list, vma_head); return 0; } static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) { switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: return "WC"; case MLX5_IB_MMAP_REGULAR_PAGE: return "best effort WC"; case MLX5_IB_MMAP_NC_PAGE: return "NC"; default: return NULL; } } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { struct mlx5_uuar_info *uuari = &context->uuari; int err; unsigned long idx; phys_addr_t pfn, pa; pgprot_t prot; switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: /* Some architectures don't support WC memory */ #if defined(CONFIG_X86) if (!pat_enabled()) return -EPERM; #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) return -EPERM; #endif /* fall through */ case MLX5_IB_MMAP_REGULAR_PAGE: /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ prot = pgprot_writecombine(vma->vm_page_prot); break; case MLX5_IB_MMAP_NC_PAGE: prot = pgprot_noncached(vma->vm_page_prot); break; default: return -EINVAL; } if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; idx = get_index(vma->vm_pgoff); if (idx >= uuari->num_uars) return -EINVAL; pfn = uar_index2pfn(dev, uuari->uars[idx].index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); vma->vm_page_prot = prot; err = io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot); if (err) { mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%llx, pfn=%pa, mmap_cmd=%s\n", err, (unsigned long long)vma->vm_start, &pfn, mmap_cmd2str(cmd)); return -EAGAIN; } pa = pfn << PAGE_SHIFT; mlx5_ib_dbg(dev, "mapped %s at 0x%llx, PA %pa\n", mmap_cmd2str(cmd), (unsigned long long)vma->vm_start, &pa); return mlx5_ib_set_vma_data(vma, context); } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); unsigned long command; phys_addr_t pfn; command = get_command(vma->vm_pgoff); switch (command) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; case MLX5_IB_MMAP_CORE_CLOCK: if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; if (vma->vm_flags & VM_WRITE) return -EPERM; /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) return -EOPNOTSUPP; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); pfn = (dev->mdev->iseg_base + offsetof(struct mlx5_init_seg, internal_timer_h)) >> PAGE_SHIFT; if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; mlx5_ib_dbg(dev, "mapped internal timer at 0x%llx, PA 0x%llx\n", (unsigned long long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); break; default: return -EINVAL; } return 0; } static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx5_ib_alloc_pd_resp resp; struct mlx5_ib_pd *pd; int err; pd = kmalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } } return &pd->ibpd; } static int mlx5_ib_dealloc_pd(struct ib_pd *pd) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); kfree(mpd); return 0; } enum { MATCH_CRITERIA_ENABLE_OUTER_BIT, MATCH_CRITERIA_ENABLE_MISC_BIT, MATCH_CRITERIA_ENABLE_INNER_BIT }; #define HEADER_IS_ZERO(match_criteria, headers) \ !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ static u8 get_match_criteria_enable(u32 *match_criteria) { u8 match_criteria_enable; match_criteria_enable = (!HEADER_IS_ZERO(match_criteria, outer_headers)) << MATCH_CRITERIA_ENABLE_OUTER_BIT; match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << MATCH_CRITERIA_ENABLE_MISC_BIT; match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, inner_headers)) << MATCH_CRITERIA_ENABLE_INNER_BIT; return match_criteria_enable; } static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) { MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); } static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) { MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); } #define LAST_ETH_FIELD vlan_tag #define LAST_IB_FIELD sl #define LAST_IPV4_FIELD tos #define LAST_IPV6_FIELD traffic_class #define LAST_TCP_UDP_FIELD src_port /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ memchr_inv((void *)&filter.field +\ sizeof(filter.field), 0,\ sizeof(filter) -\ offsetof(typeof(filter), field) -\ sizeof(filter.field)) static int parse_flow_attr(u32 *match_c, u32 *match_v, const union ib_flow_spec *ib_spec) { void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, outer_headers); void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, outer_headers); void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters); switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) return -ENOTSUPP; ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dmac_47_16), ib_spec->eth.mask.dst_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dmac_47_16), ib_spec->eth.val.dst_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, smac_47_16), ib_spec->eth.mask.src_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, smac_47_16), ib_spec->eth.val.src_mac); if (ib_spec->eth.mask.vlan_tag) { MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_vid, ntohs(ib_spec->eth.val.vlan_tag)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_cfi, ntohs(ib_spec->eth.mask.vlan_tag) >> 12); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_cfi, ntohs(ib_spec->eth.val.vlan_tag) >> 12); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_prio, ntohs(ib_spec->eth.mask.vlan_tag) >> 13); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_prio, ntohs(ib_spec->eth.val.vlan_tag) >> 13); } MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, ntohs(ib_spec->eth.mask.ether_type)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, ntohs(ib_spec->eth.val.ether_type)); break; case IB_FLOW_SPEC_IPV4: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, 0xffff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, ETH_P_IP); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.mask.src_ip, sizeof(ib_spec->ipv4.mask.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, src_ipv4_src_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.val.src_ip, sizeof(ib_spec->ipv4.val.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.mask.dst_ip, sizeof(ib_spec->ipv4.mask.dst_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.val.dst_ip, sizeof(ib_spec->ipv4.val.dst_ip)); set_tos(outer_headers_c, outer_headers_v, ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); set_proto(outer_headers_c, outer_headers_v, ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto); break; case IB_FLOW_SPEC_IPV6: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, 0xffff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, IPPROTO_IPV6); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.mask.src_ip, sizeof(ib_spec->ipv6.mask.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, src_ipv4_src_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.val.src_ip, sizeof(ib_spec->ipv6.val.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.mask.dst_ip, sizeof(ib_spec->ipv6.mask.dst_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.val.dst_ip, sizeof(ib_spec->ipv6.val.dst_ip)); set_tos(outer_headers_c, outer_headers_v, ib_spec->ipv6.mask.traffic_class, ib_spec->ipv6.val.traffic_class); set_proto(outer_headers_c, outer_headers_v, ib_spec->ipv6.mask.next_hdr, ib_spec->ipv6.val.next_hdr); MLX5_SET(fte_match_set_misc, misc_params_c, outer_ipv6_flow_label, ntohl(ib_spec->ipv6.mask.flow_label)); MLX5_SET(fte_match_set_misc, misc_params_v, outer_ipv6_flow_label, ntohl(ib_spec->ipv6.val.flow_label)); break; case IB_FLOW_SPEC_TCP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, 0xff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, IPPROTO_TCP); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport, ntohs(ib_spec->tcp_udp.mask.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport, ntohs(ib_spec->tcp_udp.val.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport, ntohs(ib_spec->tcp_udp.mask.dst_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; case IB_FLOW_SPEC_UDP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, 0xff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, IPPROTO_UDP); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport, ntohs(ib_spec->tcp_udp.mask.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport, ntohs(ib_spec->tcp_udp.val.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport, ntohs(ib_spec->tcp_udp.mask.dst_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; default: return -EINVAL; } return 0; } /* If a flow could catch both multicast and unicast packets, * it won't fall into the multicast flow steering table and this rule * could steal other multicast packets. */ static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) { struct ib_flow_spec_eth *eth_spec; if (ib_attr->type != IB_FLOW_ATTR_NORMAL || ib_attr->size < sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_eth) || ib_attr->num_of_specs < 1) return false; eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1); if (eth_spec->type != IB_FLOW_SPEC_ETH || eth_spec->size != sizeof(*eth_spec)) return false; return is_multicast_ether_addr(eth_spec->mask.dst_mac) && is_multicast_ether_addr(eth_spec->val.dst_mac); } static bool is_valid_attr(const struct ib_flow_attr *flow_attr) { union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); bool has_ipv4_spec = false; bool eth_type_ipv4 = true; unsigned int spec_index; /* Validate that ethertype is correct */ for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { if (ib_spec->type == IB_FLOW_SPEC_ETH && ib_spec->eth.mask.ether_type) { if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) && ib_spec->eth.val.ether_type == htons(ETH_P_IP))) eth_type_ipv4 = false; } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) { has_ipv4_spec = true; } ib_spec = (void *)ib_spec + ib_spec->size; } return !has_ipv4_spec || eth_type_ipv4; } static void put_flow_table(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *prio, bool ft_added) { prio->refcount -= !!ft_added; if (!prio->refcount) { mlx5_destroy_flow_table(prio->flow_table); prio->flow_table = NULL; } } static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) { struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); struct mlx5_ib_flow_handler *handler = container_of(flow_id, struct mlx5_ib_flow_handler, ibflow); struct mlx5_ib_flow_handler *iter, *tmp; mutex_lock(&dev->flow_db.lock); list_for_each_entry_safe(iter, tmp, &handler->list, list) { mlx5_del_flow_rule(iter->rule); put_flow_table(dev, iter->prio, true); list_del(&iter->list); kfree(iter); } mlx5_del_flow_rule(handler->rule); put_flow_table(dev, handler->prio, true); mutex_unlock(&dev->flow_db.lock); kfree(handler); return 0; } static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) { priority *= 2; if (!dont_trap) priority++; return priority; } enum flow_table_type { MLX5_IB_FT_RX, MLX5_IB_FT_TX }; #define MLX5_FS_MAX_TYPES 10 #define MLX5_FS_MAX_ENTRIES 32000UL static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, struct ib_flow_attr *flow_attr, enum flow_table_type ft_type) { bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_table *ft; int num_entries; int num_groups; int priority; int err = 0; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { if (flow_is_multicast_only(flow_attr) && !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else priority = ib_prio_to_core_prio(flow_attr->priority, dont_trap); ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); num_entries = MLX5_FS_MAX_ENTRIES; num_groups = MLX5_FS_MAX_TYPES; prio = &dev->flow_db.prios[priority]; } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_LEFTOVERS); build_leftovers_ft_param("bypass", &priority, &num_entries, &num_groups); prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { if (!MLX5_CAP_FLOWTABLE(dev->mdev, allow_sniffer_and_nic_rx_shared_tir)) return ERR_PTR(-ENOTSUPP); ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ? MLX5_FLOW_NAMESPACE_SNIFFER_RX : MLX5_FLOW_NAMESPACE_SNIFFER_TX); prio = &dev->flow_db.sniffer[ft_type]; priority = 0; num_entries = 1; num_groups = 1; } if (!ns) return ERR_PTR(-ENOTSUPP); ft = prio->flow_table; if (!ft) { ft = mlx5_create_auto_grouped_flow_table(ns, priority, "bypass", num_entries, num_groups); if (!IS_ERR(ft)) { prio->refcount = 0; prio->flow_table = ft; } else { err = PTR_ERR(ft); } } return err ? ERR_PTR(err) : prio; } static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, const struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; struct mlx5_flow_spec *spec; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; u32 action; int err = 0; if (!is_valid_attr(flow_attr)) return ERR_PTR(-EINVAL); spec = mlx5_vzalloc(sizeof(*spec)); handler = kzalloc(sizeof(*handler), GFP_KERNEL); if (!handler || !spec) { err = -ENOMEM; goto free; } INIT_LIST_HEAD(&handler->list); for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(spec->match_criteria, spec->match_value, ib_flow); if (err < 0) goto free; ib_flow += ((union ib_flow_spec *)ib_flow)->size; } spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; handler->rule = mlx5_add_flow_rule(ft, spec->match_criteria_enable, spec->match_criteria, spec->match_value, action, MLX5_FS_DEFAULT_FLOW_TAG, dst); if (IS_ERR(handler->rule)) { err = PTR_ERR(handler->rule); goto free; } ft_prio->refcount++; handler->prio = ft_prio; ft_prio->flow_table = ft; free: if (err) kfree(handler); kvfree(spec); return err ? ERR_PTR(err) : handler; } static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_dst = NULL; struct mlx5_ib_flow_handler *handler = NULL; handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); if (!IS_ERR(handler)) { handler_dst = create_flow_rule(dev, ft_prio, flow_attr, dst); if (IS_ERR(handler_dst)) { mlx5_del_flow_rule(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_dst; } else { list_add(&handler_dst->list, &handler->list); } } return handler; } enum { LEFTOVERS_MC, LEFTOVERS_UC, }; static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_ucast = NULL; struct mlx5_ib_flow_handler *handler = NULL; static struct { struct ib_flow_attr flow_attr; struct ib_flow_spec_eth eth_flow; } leftovers_specs[] = { [LEFTOVERS_MC] = { .flow_attr = { .num_of_specs = 1, .size = sizeof(leftovers_specs[0]) }, .eth_flow = { .type = IB_FLOW_SPEC_ETH, .size = sizeof(struct ib_flow_spec_eth), .mask = {.dst_mac = {0x1} }, .val = {.dst_mac = {0x1} } } }, [LEFTOVERS_UC] = { .flow_attr = { .num_of_specs = 1, .size = sizeof(leftovers_specs[0]) }, .eth_flow = { .type = IB_FLOW_SPEC_ETH, .size = sizeof(struct ib_flow_spec_eth), .mask = {.dst_mac = {0x1} }, .val = {.dst_mac = {} } } } }; handler = create_flow_rule(dev, ft_prio, &leftovers_specs[LEFTOVERS_MC].flow_attr, dst); if (!IS_ERR(handler) && flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { handler_ucast = create_flow_rule(dev, ft_prio, &leftovers_specs[LEFTOVERS_UC].flow_attr, dst); if (IS_ERR(handler_ucast)) { mlx5_del_flow_rule(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_ucast; } else { list_add(&handler_ucast->list, &handler->list); } } return handler; } static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_rx, struct mlx5_ib_flow_prio *ft_tx, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_rx; struct mlx5_ib_flow_handler *handler_tx; int err; static const struct ib_flow_attr flow_attr = { .num_of_specs = 0, .size = sizeof(flow_attr) }; handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); if (IS_ERR(handler_rx)) { err = PTR_ERR(handler_rx); goto err; } handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); if (IS_ERR(handler_tx)) { err = PTR_ERR(handler_tx); goto err_tx; } list_add(&handler_tx->list, &handler_rx->list); return handler_rx; err_tx: mlx5_del_flow_rule(handler_rx->rule); ft_rx->refcount--; kfree(handler_rx); err: return ERR_PTR(err); } static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); struct mlx5_ib_flow_handler *handler = NULL; struct mlx5_flow_destination *dst = NULL; struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; int err; if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOSPC); if (domain != IB_FLOW_DOMAIN_USER || flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) return ERR_PTR(-EINVAL); dst = kzalloc(sizeof(*dst), GFP_KERNEL); if (!dst) return ERR_PTR(-ENOMEM); mutex_lock(&dev->flow_db.lock); ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; } if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); if (IS_ERR(ft_prio_tx)) { err = PTR_ERR(ft_prio_tx); ft_prio_tx = NULL; goto destroy_ft; } } dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; if (mqp->flags & MLX5_IB_QP_RSS) dst->tir_num = mqp->rss_qp.tirn; else dst->tir_num = mqp->raw_packet_qp.rq.tirn; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst); } else { handler = create_flow_rule(dev, ft_prio, flow_attr, dst); } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { handler = create_leftovers_rule(dev, ft_prio, flow_attr, dst); } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); } else { err = -EINVAL; goto destroy_ft; } if (IS_ERR(handler)) { err = PTR_ERR(handler); handler = NULL; goto destroy_ft; } mutex_unlock(&dev->flow_db.lock); kfree(dst); return &handler->ibflow; destroy_ft: put_flow_table(dev, ft_prio, false); if (ft_prio_tx) put_flow_table(dev, ft_prio_tx, false); unlock: mutex_unlock(&dev->flow_db.lock); kfree(dst); kfree(handler); return ERR_PTR(err); } static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int init_node_data(struct mlx5_ib_dev *dev) { int err; err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); if (err) return err; return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); } static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages); } static ssize_t show_reg_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->mdev->pdev->revision); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_fw_pages, &dev_attr_reg_pages, }; static void pkey_change_handler(struct work_struct *work) { struct mlx5_ib_port_resources *ports = container_of(work, struct mlx5_ib_port_resources, pkey_change_work); mutex_lock(&ports->devr->mutex); mlx5_ib_gsi_pkey_change(ports->gsi); mutex_unlock(&ports->devr->mutex); } static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) { struct mlx5_ib_qp *mqp; struct mlx5_ib_cq *send_mcq, *recv_mcq; struct mlx5_core_cq *mcq; struct list_head cq_armed_list; unsigned long flags_qp; unsigned long flags_cq; unsigned long flags; INIT_LIST_HEAD(&cq_armed_list); /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { spin_lock_irqsave(&mqp->sq.lock, flags_qp); if (mqp->sq.tail != mqp->sq.head) { send_mcq = to_mcq(mqp->ibqp.send_cq); spin_lock_irqsave(&send_mcq->lock, flags_cq); if (send_mcq->mcq.comp && mqp->ibqp.send_cq->comp_handler) { if (!send_mcq->mcq.reset_notify_added) { send_mcq->mcq.reset_notify_added = 1; list_add_tail(&send_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&send_mcq->lock, flags_cq); } spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); spin_lock_irqsave(&mqp->rq.lock, flags_qp); /* no handling is needed for SRQ */ if (!mqp->ibqp.srq) { if (mqp->rq.tail != mqp->rq.head) { recv_mcq = to_mcq(mqp->ibqp.recv_cq); spin_lock_irqsave(&recv_mcq->lock, flags_cq); if (recv_mcq->mcq.comp && mqp->ibqp.recv_cq->comp_handler) { if (!recv_mcq->mcq.reset_notify_added) { recv_mcq->mcq.reset_notify_added = 1; list_add_tail(&recv_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&recv_mcq->lock, flags_cq); } } spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); } /*At that point all inflight post send were put to be executed as of we * lock/unlock above locks Now need to arm all involved CQs. */ list_for_each_entry(mcq, &cq_armed_list, reset_notify) { mcq->comp(mcq); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; struct ib_event ibev; bool fatal = false; u8 port = 0; switch (event) { case MLX5_DEV_EVENT_SYS_ERROR: ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); fatal = true; break; case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: port = (u8)param; /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == IB_LINK_LAYER_ETHERNET) return; ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; break; case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; port = (u8)param; schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: ibev.event = IB_EVENT_GID_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_CLIENT_REREG: ibev.event = IB_EVENT_CLIENT_REREGISTER; port = (u8)param; break; default: break; } ibev.device = &ibdev->ib_dev; ibev.element.port_num = port; if (port < 1 || port > ibdev->num_ports) { mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); return; } if (ibdev->ib_active) ib_dispatch_event(&ibev); if (fatal) ibdev->ib_active = false; } static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) mlx5_query_ext_port_caps(dev, port); } static int get_port_caps(struct mlx5_ib_dev *dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; int port; struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); if (!dprops) goto out; err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; } for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); break; } dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys; dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } out: kfree(pprops); kfree(dprops); return err; } static void destroy_umrc_res(struct mlx5_ib_dev *dev) { int err; err = mlx5_mr_cache_cleanup(dev); if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); mlx5_ib_destroy_qp(dev->umrc.qp); ib_free_cq(dev->umrc.cq); ib_dealloc_pd(dev->umrc.pd); } enum { MAX_UMR_WR = 128, }; static int create_umr_res(struct mlx5_ib_dev *dev) { struct ib_qp_init_attr *init_attr = NULL; struct ib_qp_attr *attr = NULL; struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; int ret; attr = kzalloc(sizeof(*attr), GFP_KERNEL); init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); if (!attr || !init_attr) { ret = -ENOMEM; goto error_0; } pd = ib_alloc_pd(&dev->ib_dev, 0); if (IS_ERR(pd)) { mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); ret = PTR_ERR(pd); goto error_0; } cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); goto error_2; } init_attr->send_cq = cq; init_attr->recv_cq = cq; init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; init_attr->cap.max_send_wr = MAX_UMR_WR; init_attr->cap.max_send_sge = 1; init_attr->qp_type = MLX5_IB_QPT_REG_UMR; init_attr->port_num = 1; qp = mlx5_ib_create_qp(pd, init_attr, NULL); if (IS_ERR(qp)) { mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); ret = PTR_ERR(qp); goto error_3; } qp->device = &dev->ib_dev; qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = MLX5_IB_QPT_REG_UMR; attr->qp_state = IB_QPS_INIT; attr->port_num = 1; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); goto error_4; } memset(attr, 0, sizeof(*attr)); attr->qp_state = IB_QPS_RTR; attr->path_mtu = IB_MTU_256; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); goto error_4; } memset(attr, 0, sizeof(*attr)); attr->qp_state = IB_QPS_RTS; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); goto error_4; } dev->umrc.qp = qp; dev->umrc.cq = cq; dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); ret = mlx5_mr_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); goto error_4; } kfree(attr); kfree(init_attr); return 0; error_4: mlx5_ib_destroy_qp(qp); error_3: ib_free_cq(cq); error_2: ib_dealloc_pd(pd); error_0: kfree(attr); kfree(init_attr); return ret; } static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); mutex_init(&devr->mutex); devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); goto error0; } devr->p0->device = &dev->ib_dev; devr->p0->uobject = NULL; atomic_set(&devr->p0->usecnt, 0); devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); if (IS_ERR(devr->c0)) { ret = PTR_ERR(devr->c0); goto error1; } devr->c0->device = &dev->ib_dev; devr->c0->uobject = NULL; devr->c0->comp_handler = NULL; devr->c0->event_handler = NULL; devr->c0->cq_context = NULL; atomic_set(&devr->c0->usecnt, 0); devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); goto error2; } devr->x0->device = &dev->ib_dev; devr->x0->inode = NULL; atomic_set(&devr->x0->usecnt, 0); mutex_init(&devr->x0->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x0->tgt_qp_list); devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x1)) { ret = PTR_ERR(devr->x1); goto error3; } devr->x1->device = &dev->ib_dev; devr->x1->inode = NULL; atomic_set(&devr->x1->usecnt, 0); mutex_init(&devr->x1->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x1->tgt_qp_list); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; attr.ext.xrc.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s0)) { ret = PTR_ERR(devr->s0); goto error4; } devr->s0->device = &dev->ib_dev; devr->s0->pd = devr->p0; devr->s0->uobject = NULL; devr->s0->event_handler = NULL; devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; devr->s0->ext.xrc.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); atomic_inc(&devr->s0->ext.xrc.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s1)) { ret = PTR_ERR(devr->s1); goto error5; } devr->s1->device = &dev->ib_dev; devr->s1->pd = devr->p0; devr->s1->uobject = NULL; devr->s1->event_handler = NULL; devr->s1->srq_context = NULL; devr->s1->srq_type = IB_SRQT_BASIC; devr->s1->ext.xrc.cq = devr->c0; atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { INIT_WORK(&devr->ports[port].pkey_change_work, pkey_change_handler); devr->ports[port].devr = devr; } return 0; error5: mlx5_ib_destroy_srq(devr->s0); error4: mlx5_ib_dealloc_xrcd(devr->x1); error3: mlx5_ib_dealloc_xrcd(devr->x0); error2: mlx5_ib_destroy_cq(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0); error0: return ret; } static void destroy_dev_resources(struct mlx5_ib_resources *devr) { struct mlx5_ib_dev *dev = container_of(devr, struct mlx5_ib_dev, devr); int port; mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); /* Make sure no change P_Key work items are still executing */ for (port = 0; port < dev->num_ports; ++port) cancel_work_sync(&devr->ports[port].pkey_change_work); } static u32 get_core_cap_flags(struct ib_device *ibdev) { struct mlx5_ib_dev *dev = to_mdev(ibdev); enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); u32 ret = 0; if (ll == IB_LINK_LAYER_INFINIBAND) return RDMA_CORE_PORT_IBA_IB; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) return 0; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) return 0; if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE; if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; return ret; } static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; int err; err = mlx5_ib_query_port(ibdev, port_num, &attr); if (err) return err; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = get_core_cap_flags(ibdev); immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } static void get_dev_fw_str(struct ib_device *ibdev, char *str, size_t str_len) { struct mlx5_ib_dev *dev = container_of(ibdev, struct mlx5_ib_dev, ib_dev); snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); } static int mlx5_roce_lag_init(struct mlx5_ib_dev *dev) { return 0; } static void mlx5_roce_lag_cleanup(struct mlx5_ib_dev *dev) { } static void mlx5_remove_roce_notifier(struct mlx5_ib_dev *dev) { if (dev->roce.nb.notifier_call) { unregister_netdevice_notifier(&dev->roce.nb); dev->roce.nb.notifier_call = NULL; } } static int mlx5_enable_roce(struct mlx5_ib_dev *dev) { VNET_ITERATOR_DECL(vnet_iter); struct net_device *idev; int err; /* Check if mlx5en net device already exists */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { IFNET_RLOCK(); CURVNET_SET_QUIET(vnet_iter); - TAILQ_FOREACH(idev, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(idev, &V_ifnet, if_link) { /* check if network interface belongs to mlx5en */ if (!mlx5_netdev_match(idev, dev->mdev, "mce")) continue; write_lock(&dev->roce.netdev_lock); dev->roce.netdev = idev; write_unlock(&dev->roce.netdev_lock); } CURVNET_RESTORE(); IFNET_RUNLOCK(); } VNET_LIST_RUNLOCK(); dev->roce.nb.notifier_call = mlx5_netdev_event; err = register_netdevice_notifier(&dev->roce.nb); if (err) { dev->roce.nb.notifier_call = NULL; return err; } err = mlx5_nic_vport_enable_roce(dev->mdev); if (err) goto err_unregister_netdevice_notifier; err = mlx5_roce_lag_init(dev); if (err) goto err_disable_roce; return 0; err_disable_roce: mlx5_nic_vport_disable_roce(dev->mdev); err_unregister_netdevice_notifier: mlx5_remove_roce_notifier(dev); return err; } static void mlx5_disable_roce(struct mlx5_ib_dev *dev) { mlx5_roce_lag_cleanup(dev); mlx5_nic_vport_disable_roce(dev->mdev); } static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) { mlx5_vport_dealloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, dev->port[port_num].q_cnt_id); dev->port[port_num].q_cnt_id = 0; } static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) { unsigned int i; for (i = 0; i < dev->num_ports; i++) mlx5_ib_dealloc_q_port_counter(dev, i); } static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) { int i; int ret; for (i = 0; i < dev->num_ports; i++) { ret = mlx5_vport_alloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, &dev->port[i].q_cnt_id); if (ret) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", i + 1, ret); goto dealloc_counters; } } return 0; dealloc_counters: while (--i >= 0) mlx5_ib_dealloc_q_port_counter(dev, i); return ret; } static const char * const names[] = { "rx_write_requests", "rx_read_requests", "rx_atomic_requests", "out_of_buffer", "out_of_sequence", "duplicate_request", "rnr_nak_retry_err", "packet_seq_err", "implied_nak_seq_err", "local_ack_timeout_err", }; static const size_t stats_offsets[] = { MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests), MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests), MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests), MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer), MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence), MLX5_BYTE_OFF(query_q_counter_out, duplicate_request), MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err), MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err), MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err), MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err), }; static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, u8 port_num) { BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets)); /* We support only per port stats */ if (port_num == 0) return NULL; return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names), RDMA_HW_STATS_DEFAULT_LIFESPAN); } static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u8 port, int index) { struct mlx5_ib_dev *dev = to_mdev(ibdev); int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); void *out; __be32 val; int ret; int i; if (!port || !stats) return -ENOSYS; out = mlx5_vzalloc(outlen); if (!out) return -ENOMEM; ret = mlx5_vport_query_q_counter(dev->mdev, dev->port[port - 1].q_cnt_id, 0, out, outlen); if (ret) goto free; for (i = 0; i < ARRAY_SIZE(names); i++) { val = *(__be32 *)(out + stats_offsets[i]); stats->value[i] = (u64)be32_to_cpu(val); } free: kvfree(out); return ARRAY_SIZE(names); } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; enum rdma_link_layer ll; int port_type_cap; const char *name; int err; int i; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce)) return NULL; printk_once(KERN_INFO "%s", mlx5_version); dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); if (!dev) return NULL; dev->mdev = mdev; dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), GFP_KERNEL); if (!dev->port) goto err_dealloc; rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_free_port; if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); name = "mlx5_%d"; strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dma_device = &mdev->pdev->dev; dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_CREATE_AH) | (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); dev->ib_dev.uverbs_ex_cmd_mask = (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; if (ll == IB_LINK_LAYER_ETHERNET) dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; dev->ib_dev.add_gid = mlx5_ib_add_gid; dev->ib_dev.del_gid = mlx5_ib_del_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; dev->ib_dev.mmap = mlx5_ib_mmap; dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; dev->ib_dev.create_ah = mlx5_ib_create_ah; dev->ib_dev.query_ah = mlx5_ib_query_ah; dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; dev->ib_dev.create_srq = mlx5_ib_create_srq; dev->ib_dev.modify_srq = mlx5_ib_modify_srq; dev->ib_dev.query_srq = mlx5_ib_query_srq; dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; dev->ib_dev.create_qp = mlx5_ib_create_qp; dev->ib_dev.modify_qp = mlx5_ib_modify_qp; dev->ib_dev.query_qp = mlx5_ib_query_qp; dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; dev->ib_dev.post_send = mlx5_ib_post_send; dev->ib_dev.post_recv = mlx5_ib_post_recv; dev->ib_dev.create_cq = mlx5_ib_create_cq; dev->ib_dev.modify_cq = mlx5_ib_modify_cq; dev->ib_dev.resize_cq = mlx5_ib_resize_cq; dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; dev->ib_dev.poll_cq = mlx5_ib_poll_cq; dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; dev->ib_dev.process_mad = mlx5_ib_process_mad; dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr; dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; dev->ib_dev.get_dev_fw_str = get_dev_fw_str; if (mlx5_core_is_pf(mdev)) { dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats; dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; } mlx5_ib_internal_fill_odp_caps(dev); if (MLX5_CAP_GEN(mdev, imaicl)) { dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; } if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { dev->ib_dev.create_flow = mlx5_ib_create_flow; dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; dev->ib_dev.create_wq = mlx5_ib_create_wq; dev->ib_dev.modify_wq = mlx5_ib_modify_wq; dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; dev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); } err = init_node_data(dev); if (err) goto err_free_port; mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); if (ll == IB_LINK_LAYER_ETHERNET) { err = mlx5_enable_roce(dev); if (err) goto err_free_port; } err = create_dev_resources(&dev->devr); if (err) goto err_disable_roce; err = mlx5_ib_odp_init_one(dev); if (err) goto err_rsrc; err = mlx5_ib_alloc_q_counters(dev); if (err) goto err_odp; err = ib_register_device(&dev->ib_dev, NULL); if (err) goto err_q_cnt; err = create_umr_res(dev); if (err) goto err_dev; for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) goto err_umrc; } err = mlx5_ib_init_congestion(dev); if (err) goto err_umrc; dev->ib_active = true; return dev; err_umrc: destroy_umrc_res(dev); err_dev: ib_unregister_device(&dev->ib_dev); err_q_cnt: mlx5_ib_dealloc_q_counters(dev); err_odp: mlx5_ib_odp_remove_one(dev); err_rsrc: destroy_dev_resources(&dev->devr); err_disable_roce: if (ll == IB_LINK_LAYER_ETHERNET) { mlx5_disable_roce(dev); mlx5_remove_roce_notifier(dev); } err_free_port: kfree(dev->port); err_dealloc: ib_dealloc_device((struct ib_device *)dev); return NULL; } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); mlx5_ib_cleanup_congestion(dev); mlx5_remove_roce_notifier(dev); ib_unregister_device(&dev->ib_dev); mlx5_ib_dealloc_q_counters(dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); if (ll == IB_LINK_LAYER_ETHERNET) mlx5_disable_roce(dev); kfree(dev->port); ib_dealloc_device(&dev->ib_dev); } static struct mlx5_interface mlx5_ib_interface = { .add = mlx5_ib_add, .remove = mlx5_ib_remove, .event = mlx5_ib_event, .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; static int __init mlx5_ib_init(void) { int err; if (deprecated_prof_sel != 2) pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); err = mlx5_ib_odp_init(); if (err) return err; err = mlx5_register_interface(&mlx5_ib_interface); if (err) goto clean_odp; return err; clean_odp: mlx5_ib_odp_cleanup(); return err; } static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); mlx5_ib_odp_cleanup(); } module_init_order(mlx5_ib_init, SI_ORDER_THIRD); module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD); Index: head/sys/dev/wtap/if_wtap.c =================================================================== --- head/sys/dev/wtap/if_wtap.c (revision 334117) +++ head/sys/dev/wtap/if_wtap.c (revision 334118) @@ -1,742 +1,742 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010-2011 Monthadar Al Jaberi, TerraNet AB * All rights reserved. * * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #include "if_wtapvar.h" #include /* uio struct */ #include #include #include #include #include "if_medium.h" /* * This _requires_ vimage to be useful. */ #ifndef VIMAGE #error if_wtap requires VIMAGE. #endif /* VIMAGE */ /* device for IOCTL and read/write for debuggin purposes */ /* Function prototypes */ static d_open_t wtap_node_open; static d_close_t wtap_node_close; static d_write_t wtap_node_write; static d_ioctl_t wtap_node_ioctl; static struct cdevsw wtap_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = wtap_node_open, .d_close = wtap_node_close, .d_write = wtap_node_write, .d_ioctl = wtap_node_ioctl, .d_name = "wtapnode", }; static int wtap_node_open(struct cdev *dev, int oflags, int devtype, struct thread *p) { int err = 0; uprintf("Opened device \"echo\" successfully.\n"); return(err); } static int wtap_node_close(struct cdev *dev, int fflag, int devtype, struct thread *p) { uprintf("Closing device \"echo.\"\n"); return(0); } static int wtap_node_write(struct cdev *dev, struct uio *uio, int ioflag) { int err = 0; struct mbuf *m; struct ifnet *ifp; struct wtap_softc *sc; uint8_t buf[1024]; int buf_len; uprintf("write device %s \"echo.\"\n", devtoname(dev)); buf_len = MIN(uio->uio_iov->iov_len, 1024); err = copyin(uio->uio_iov->iov_base, buf, buf_len); if (err != 0) { uprintf("Write failed: bad address!\n"); return (err); } MGETHDR(m, M_NOWAIT, MT_DATA); m_copyback(m, 0, buf_len, buf); CURVNET_SET(TD_TO_VNET(curthread)); IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { printf("ifp->if_xname = %s\n", ifp->if_xname); if(strcmp(devtoname(dev), ifp->if_xname) == 0){ printf("found match, correspoding wtap = %s\n", ifp->if_xname); sc = (struct wtap_softc *)ifp->if_softc; printf("wtap id = %d\n", sc->id); wtap_inject(sc, m); } } IFNET_RUNLOCK_NOSLEEP(); CURVNET_RESTORE(); return(err); } int wtap_node_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error = 0; switch(cmd) { default: DWTAP_PRINTF("Unknown WTAP IOCTL\n"); error = EINVAL; } return error; } static int wtap_raw_xmit(struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_bpf_params *params); static int wtap_medium_enqueue(struct wtap_vap *avp, struct mbuf *m) { return medium_transmit(avp->av_md, avp->id, m); } static int wtap_media_change(struct ifnet *ifp) { DWTAP_PRINTF("%s\n", __func__); int error = ieee80211_media_change(ifp); /* NB: only the fixed rate can change and that doesn't need a reset */ return (error == ENETRESET ? 0 : error); } /* * Intercept management frames to collect beacon rssi data * and to do ibss merges. */ static void wtap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m, int subtype, const struct ieee80211_rx_stats *stats, int rssi, int nf) { struct ieee80211vap *vap = ni->ni_vap; #if 0 DWTAP_PRINTF("[%d] %s\n", myath_id(ni), __func__); #endif WTAP_VAP(vap)->av_recv_mgmt(ni, m, subtype, stats, rssi, nf); } static int wtap_reset_vap(struct ieee80211vap *vap, u_long cmd) { DWTAP_PRINTF("%s\n", __func__); return 0; } static void wtap_beacon_update(struct ieee80211vap *vap, int item) { struct ieee80211_beacon_offsets *bo = &vap->iv_bcn_off; DWTAP_PRINTF("%s\n", __func__); setbit(bo->bo_flags, item); } /* * Allocate and setup an initial beacon frame. */ static int wtap_beacon_alloc(struct wtap_softc *sc, struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct wtap_vap *avp = WTAP_VAP(vap); DWTAP_PRINTF("[%s] %s\n", ether_sprintf(ni->ni_macaddr), __func__); /* * NB: the beacon data buffer must be 32-bit aligned; * we assume the mbuf routines will return us something * with this alignment (perhaps should assert). */ avp->beacon = ieee80211_beacon_alloc(ni); if (avp->beacon == NULL) { printf("%s: cannot get mbuf\n", __func__); return ENOMEM; } callout_init(&avp->av_swba, 0); avp->bf_node = ieee80211_ref_node(ni); return 0; } static void wtap_beacon_config(struct wtap_softc *sc, struct ieee80211vap *vap) { DWTAP_PRINTF("%s\n", __func__); } static void wtap_beacon_intrp(void *arg) { struct wtap_vap *avp = arg; struct ieee80211vap *vap = arg; struct mbuf *m; if (vap->iv_state < IEEE80211_S_RUN) { DWTAP_PRINTF("Skip beacon, not running, state %d", vap->iv_state); return ; } DWTAP_PRINTF("[%d] beacon intrp\n", avp->id); //burst mode /* * Update dynamic beacon contents. If this returns * non-zero then we need to remap the memory because * the beacon frame changed size (probably because * of the TIM bitmap). */ m = m_dup(avp->beacon, M_NOWAIT); if (ieee80211_beacon_update(avp->bf_node, m, 0)) { printf("%s, need to remap the memory because the beacon frame" " changed size.\n",__func__); } if (ieee80211_radiotap_active_vap(vap)) ieee80211_radiotap_tx(vap, m); #if 0 medium_transmit(avp->av_md, avp->id, m); #endif wtap_medium_enqueue(avp, m); callout_schedule(&avp->av_swba, avp->av_bcinterval); } static int wtap_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg) { struct ieee80211com *ic = vap->iv_ic; struct wtap_softc *sc = ic->ic_softc; struct wtap_vap *avp = WTAP_VAP(vap); struct ieee80211_node *ni = NULL; int error; DWTAP_PRINTF("%s\n", __func__); ni = ieee80211_ref_node(vap->iv_bss); /* * Invoke the parent method to do net80211 work. */ error = avp->av_newstate(vap, nstate, arg); if (error != 0) goto bad; if (nstate == IEEE80211_S_RUN) { /* NB: collect bss node again, it may have changed */ ieee80211_free_node(ni); ni = ieee80211_ref_node(vap->iv_bss); switch (vap->iv_opmode) { case IEEE80211_M_MBSS: error = wtap_beacon_alloc(sc, ni); if (error != 0) goto bad; wtap_beacon_config(sc, vap); callout_reset(&avp->av_swba, avp->av_bcinterval, wtap_beacon_intrp, vap); break; default: goto bad; } } else if (nstate == IEEE80211_S_INIT) { callout_stop(&avp->av_swba); } ieee80211_free_node(ni); return 0; bad: printf("%s: bad\n", __func__); ieee80211_free_node(ni); return error; } static void wtap_bmiss(struct ieee80211vap *vap) { struct wtap_vap *avp = (struct wtap_vap *)vap; DWTAP_PRINTF("%s\n", __func__); avp->av_bmiss(vap); } static struct ieee80211vap * wtap_vap_create(struct ieee80211com *ic, const char name[IFNAMSIZ], int unit, enum ieee80211_opmode opmode, int flags, const uint8_t bssid[IEEE80211_ADDR_LEN], const uint8_t mac[IEEE80211_ADDR_LEN]) { struct wtap_softc *sc = ic->ic_softc; struct ieee80211vap *vap; struct wtap_vap *avp; int error; struct ieee80211_node *ni; DWTAP_PRINTF("%s\n", __func__); avp = malloc(sizeof(struct wtap_vap), M_80211_VAP, M_WAITOK | M_ZERO); avp->id = sc->id; avp->av_md = sc->sc_md; avp->av_bcinterval = msecs_to_ticks(BEACON_INTRERVAL + 100*sc->id); vap = (struct ieee80211vap *) avp; error = ieee80211_vap_setup(ic, vap, name, unit, IEEE80211_M_MBSS, flags | IEEE80211_CLONE_NOBEACONS, bssid); if (error) { free(avp, M_80211_VAP); return (NULL); } /* override various methods */ avp->av_recv_mgmt = vap->iv_recv_mgmt; vap->iv_recv_mgmt = wtap_recv_mgmt; vap->iv_reset = wtap_reset_vap; vap->iv_update_beacon = wtap_beacon_update; avp->av_newstate = vap->iv_newstate; vap->iv_newstate = wtap_newstate; avp->av_bmiss = vap->iv_bmiss; vap->iv_bmiss = wtap_bmiss; /* complete setup */ ieee80211_vap_attach(vap, wtap_media_change, ieee80211_media_status, mac); avp->av_dev = make_dev(&wtap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "%s", (const char *)sc->name); /* TODO this is a hack to force it to choose the rate we want */ ni = ieee80211_ref_node(vap->iv_bss); ni->ni_txrate = 130; ieee80211_free_node(ni); return vap; } static void wtap_vap_delete(struct ieee80211vap *vap) { struct wtap_vap *avp = WTAP_VAP(vap); DWTAP_PRINTF("%s\n", __func__); destroy_dev(avp->av_dev); callout_stop(&avp->av_swba); ieee80211_vap_detach(vap); free((struct wtap_vap*) vap, M_80211_VAP); } static void wtap_parent(struct ieee80211com *ic) { struct wtap_softc *sc = ic->ic_softc; if (ic->ic_nrunning > 0) { sc->up = 1; ieee80211_start_all(ic); } else sc->up = 0; } static void wtap_scan_start(struct ieee80211com *ic) { #if 0 DWTAP_PRINTF("%s\n", __func__); #endif } static void wtap_scan_end(struct ieee80211com *ic) { #if 0 DWTAP_PRINTF("%s\n", __func__); #endif } static void wtap_set_channel(struct ieee80211com *ic) { #if 0 DWTAP_PRINTF("%s\n", __func__); #endif } static int wtap_raw_xmit(struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_bpf_params *params) { #if 0 DWTAP_PRINTF("%s, %p\n", __func__, m); #endif struct ieee80211vap *vap = ni->ni_vap; struct wtap_vap *avp = WTAP_VAP(vap); if (ieee80211_radiotap_active_vap(vap)) { ieee80211_radiotap_tx(vap, m); } if (m->m_flags & M_TXCB) ieee80211_process_callback(ni, m, 0); ieee80211_free_node(ni); return wtap_medium_enqueue(avp, m); } void wtap_inject(struct wtap_softc *sc, struct mbuf *m) { struct wtap_buf *bf = (struct wtap_buf *)malloc(sizeof(struct wtap_buf), M_WTAP_RXBUF, M_NOWAIT | M_ZERO); KASSERT(bf != NULL, ("could not allocated a new wtap_buf\n")); bf->m = m; mtx_lock(&sc->sc_mtx); STAILQ_INSERT_TAIL(&sc->sc_rxbuf, bf, bf_list); taskqueue_enqueue(sc->sc_tq, &sc->sc_rxtask); mtx_unlock(&sc->sc_mtx); } void wtap_rx_deliver(struct wtap_softc *sc, struct mbuf *m) { struct ieee80211com *ic = &sc->sc_ic; struct ieee80211_node *ni; int type; #if 0 DWTAP_PRINTF("%s\n", __func__); #endif DWTAP_PRINTF("[%d] receiving m=%p\n", sc->id, m); if (m == NULL) { /* NB: shouldn't happen */ ic_printf(ic, "%s: no mbuf!\n", __func__); } ieee80211_dump_pkt(ic, mtod(m, caddr_t), 0,0,0); /* * Locate the node for sender, track state, and then * pass the (referenced) node up to the 802.11 layer * for its use. */ ni = ieee80211_find_rxnode_withkey(ic, mtod(m, const struct ieee80211_frame_min *),IEEE80211_KEYIX_NONE); if (ni != NULL) { /* * Sending station is known, dispatch directly. */ type = ieee80211_input(ni, m, 1<<7, 10); ieee80211_free_node(ni); } else { type = ieee80211_input_all(ic, m, 1<<7, 10); } } static void wtap_rx_proc(void *arg, int npending) { struct wtap_softc *sc = (struct wtap_softc *)arg; struct ieee80211com *ic = &sc->sc_ic; struct mbuf *m; struct ieee80211_node *ni; int type; struct wtap_buf *bf; #if 0 DWTAP_PRINTF("%s\n", __func__); #endif for(;;) { mtx_lock(&sc->sc_mtx); bf = STAILQ_FIRST(&sc->sc_rxbuf); if (bf == NULL) { mtx_unlock(&sc->sc_mtx); return; } STAILQ_REMOVE_HEAD(&sc->sc_rxbuf, bf_list); mtx_unlock(&sc->sc_mtx); KASSERT(bf != NULL, ("wtap_buf is NULL\n")); m = bf->m; DWTAP_PRINTF("[%d] receiving m=%p\n", sc->id, bf->m); if (m == NULL) { /* NB: shouldn't happen */ ic_printf(ic, "%s: no mbuf!\n", __func__); free(bf, M_WTAP_RXBUF); return; } #if 0 ieee80211_dump_pkt(ic, mtod(m, caddr_t), 0,0,0); #endif /* * Locate the node for sender, track state, and then * pass the (referenced) node up to the 802.11 layer * for its use. */ ni = ieee80211_find_rxnode_withkey(ic, mtod(m, const struct ieee80211_frame_min *), IEEE80211_KEYIX_NONE); if (ni != NULL) { /* * Sending station is known, dispatch directly. */ type = ieee80211_input(ni, m, 1<<7, 10); ieee80211_free_node(ni); } else { type = ieee80211_input_all(ic, m, 1<<7, 10); } /* The mbufs are freed by the Net80211 stack */ free(bf, M_WTAP_RXBUF); } } static void wtap_newassoc(struct ieee80211_node *ni, int isnew) { DWTAP_PRINTF("%s\n", __func__); } /* * Callback from the 802.11 layer to update WME parameters. */ static int wtap_wme_update(struct ieee80211com *ic) { DWTAP_PRINTF("%s\n", __func__); return 0; } static void wtap_update_mcast(struct ieee80211com *ic) { DWTAP_PRINTF("%s\n", __func__); } static void wtap_update_promisc(struct ieee80211com *ic) { DWTAP_PRINTF("%s\n", __func__); } static int wtap_transmit(struct ieee80211com *ic, struct mbuf *m) { struct ieee80211_node *ni = (struct ieee80211_node *) m->m_pkthdr.rcvif; struct ieee80211vap *vap = ni->ni_vap; struct wtap_vap *avp = WTAP_VAP(vap); if(ni == NULL){ printf("m->m_pkthdr.rcvif is NULL we cant radiotap_tx\n"); }else{ if (ieee80211_radiotap_active_vap(vap)) ieee80211_radiotap_tx(vap, m); } if (m->m_flags & M_TXCB) ieee80211_process_callback(ni, m, 0); ieee80211_free_node(ni); return wtap_medium_enqueue(avp, m); } static struct ieee80211_node * wtap_node_alloc(struct ieee80211vap *vap, const uint8_t mac[IEEE80211_ADDR_LEN]) { struct ieee80211_node *ni; DWTAP_PRINTF("%s\n", __func__); ni = malloc(sizeof(struct ieee80211_node), M_80211_NODE, M_NOWAIT|M_ZERO); ni->ni_txrate = 130; return ni; } static void wtap_node_free(struct ieee80211_node *ni) { struct ieee80211com *ic = ni->ni_ic; struct wtap_softc *sc = ic->ic_softc; DWTAP_PRINTF("%s\n", __func__); sc->sc_node_free(ni); } int32_t wtap_attach(struct wtap_softc *sc, const uint8_t *macaddr) { struct ieee80211com *ic = &sc->sc_ic; DWTAP_PRINTF("%s\n", __func__); sc->up = 0; STAILQ_INIT(&sc->sc_rxbuf); sc->sc_tq = taskqueue_create("wtap_taskq", M_NOWAIT | M_ZERO, taskqueue_thread_enqueue, &sc->sc_tq); taskqueue_start_threads(&sc->sc_tq, 1, PI_SOFT, "%s taskQ", sc->name); TASK_INIT(&sc->sc_rxtask, 0, wtap_rx_proc, sc); ic->ic_softc = sc; ic->ic_name = sc->name; ic->ic_phytype = IEEE80211_T_DS; ic->ic_opmode = IEEE80211_M_MBSS; ic->ic_caps = IEEE80211_C_MBSS; ic->ic_max_keyix = 128; /* A value read from Atheros ATH_KEYMAX */ ic->ic_regdomain.regdomain = SKU_ETSI; ic->ic_regdomain.country = CTRY_SWEDEN; ic->ic_regdomain.location = 1; /* Indoors */ ic->ic_regdomain.isocc[0] = 'S'; ic->ic_regdomain.isocc[1] = 'E'; ic->ic_nchans = 1; ic->ic_channels[0].ic_flags = IEEE80211_CHAN_B; ic->ic_channels[0].ic_freq = 2412; IEEE80211_ADDR_COPY(ic->ic_macaddr, macaddr); ieee80211_ifattach(ic); /* override default methods */ ic->ic_newassoc = wtap_newassoc; ic->ic_wme.wme_update = wtap_wme_update; ic->ic_vap_create = wtap_vap_create; ic->ic_vap_delete = wtap_vap_delete; ic->ic_raw_xmit = wtap_raw_xmit; ic->ic_update_mcast = wtap_update_mcast; ic->ic_update_promisc = wtap_update_promisc; ic->ic_transmit = wtap_transmit; ic->ic_parent = wtap_parent; sc->sc_node_alloc = ic->ic_node_alloc; ic->ic_node_alloc = wtap_node_alloc; sc->sc_node_free = ic->ic_node_free; ic->ic_node_free = wtap_node_free; ic->ic_scan_start = wtap_scan_start; ic->ic_scan_end = wtap_scan_end; ic->ic_set_channel = wtap_set_channel; ieee80211_radiotap_attach(ic, &sc->sc_tx_th.wt_ihdr, sizeof(sc->sc_tx_th), WTAP_TX_RADIOTAP_PRESENT, &sc->sc_rx_th.wr_ihdr, sizeof(sc->sc_rx_th), WTAP_RX_RADIOTAP_PRESENT); /* Work here, we must find a way to populate the rate table */ #if 0 if(ic->ic_rt == NULL){ printf("no table for ic_curchan\n"); ic->ic_rt = ieee80211_get_ratetable(&ic->ic_channels[0]); } printf("ic->ic_rt =%p\n", ic->ic_rt); printf("rate count %d\n", ic->ic_rt->rateCount); uint8_t code = ic->ic_rt->info[0].dot11Rate; uint8_t cix = ic->ic_rt->info[0].ctlRateIndex; uint8_t ctl_rate = ic->ic_rt->info[cix].dot11Rate; printf("code=%d, cix=%d, ctl_rate=%d\n", code, cix, ctl_rate); uint8_t rix0 = ic->ic_rt->rateCodeToIndex[130]; uint8_t rix1 = ic->ic_rt->rateCodeToIndex[132]; uint8_t rix2 = ic->ic_rt->rateCodeToIndex[139]; uint8_t rix3 = ic->ic_rt->rateCodeToIndex[150]; printf("rix0 %u,rix1 %u,rix2 %u,rix3 %u\n", rix0,rix1,rix2,rix3); printf("lpAckDuration=%u\n", ic->ic_rt->info[0].lpAckDuration); printf("rate=%d\n", ic->ic_rt->info[0].rateKbps); #endif return 0; } int32_t wtap_detach(struct wtap_softc *sc) { struct ieee80211com *ic = &sc->sc_ic; DWTAP_PRINTF("%s\n", __func__); ieee80211_ageq_drain(&ic->ic_stageq); ieee80211_ifdetach(ic); return 0; } void wtap_resume(struct wtap_softc *sc) { DWTAP_PRINTF("%s\n", __func__); } void wtap_suspend(struct wtap_softc *sc) { DWTAP_PRINTF("%s\n", __func__); } void wtap_shutdown(struct wtap_softc *sc) { DWTAP_PRINTF("%s\n", __func__); } void wtap_intr(struct wtap_softc *sc) { DWTAP_PRINTF("%s\n", __func__); } Index: head/sys/net/altq/altq_subr.c =================================================================== --- head/sys/net/altq/altq_subr.c (revision 334117) +++ head/sys/net/altq/altq_subr.c (revision 334118) @@ -1,1976 +1,1976 @@ /*- * Copyright (C) 1997-2003 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $ * $FreeBSD$ */ #include "opt_altq.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #ifdef ALTQ3_COMPAT #include #endif /* machine dependent clock related includes */ #include #include #include #include #if defined(__amd64__) || defined(__i386__) #include /* for pentium tsc */ #include /* for CPUID_TSC */ #include /* for cpu_feature */ #endif /* __amd64 || __i386__ */ /* * internal function prototypes */ static void tbr_timeout(void *); int (*altq_input)(struct mbuf *, int) = NULL; static struct mbuf *tbr_dequeue(struct ifaltq *, int); static int tbr_timer = 0; /* token bucket regulator timer */ #if !defined(__FreeBSD__) || (__FreeBSD_version < 600000) static struct callout tbr_callout = CALLOUT_INITIALIZER; #else static struct callout tbr_callout; #endif #ifdef ALTQ3_CLFIER_COMPAT static int extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *); #ifdef INET6 static int extract_ports6(struct mbuf *, struct ip6_hdr *, struct flowinfo_in6 *); #endif static int apply_filter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); static int apply_ppfilter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); #ifdef INET6 static int apply_filter6(u_int32_t, struct flow_filter6 *, struct flowinfo_in6 *); #endif static int apply_tosfilter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); static u_long get_filt_handle(struct acc_classifier *, int); static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long); static u_int32_t filt2fibmask(struct flow_filter *); static void ip4f_cache(struct ip *, struct flowinfo_in *); static int ip4f_lookup(struct ip *, struct flowinfo_in *); static int ip4f_init(void); static struct ip4_frag *ip4f_alloc(void); static void ip4f_free(struct ip4_frag *); #endif /* ALTQ3_CLFIER_COMPAT */ /* * alternate queueing support routines */ /* look up the queue state by the interface name and the queueing type. */ void * altq_lookup(name, type) char *name; int type; { struct ifnet *ifp; if ((ifp = ifunit(name)) != NULL) { /* read if_snd unlocked */ if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) return (ifp->if_snd.altq_disc); } return NULL; } int altq_attach(ifq, type, discipline, enqueue, dequeue, request, clfier, classify) struct ifaltq *ifq; int type; void *discipline; int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *); struct mbuf *(*dequeue)(struct ifaltq *, int); int (*request)(struct ifaltq *, int, void *); void *clfier; void *(*classify)(void *, struct mbuf *, int); { IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } #ifdef ALTQ3_COMPAT /* * pfaltq can override the existing discipline, but altq3 cannot. * check these if clfier is not NULL (which implies altq3). */ if (clfier != NULL) { if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return EBUSY; } if (ALTQ_IS_ATTACHED(ifq)) { IFQ_UNLOCK(ifq); return EEXIST; } } #endif ifq->altq_type = type; ifq->altq_disc = discipline; ifq->altq_enqueue = enqueue; ifq->altq_dequeue = dequeue; ifq->altq_request = request; ifq->altq_clfier = clfier; ifq->altq_classify = classify; ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); #ifdef ALTQ3_COMPAT #ifdef ALTQ_KLD altq_module_incref(type); #endif #endif IFQ_UNLOCK(ifq); return 0; } int altq_detach(ifq) struct ifaltq *ifq; { IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return EBUSY; } if (!ALTQ_IS_ATTACHED(ifq)) { IFQ_UNLOCK(ifq); return (0); } #ifdef ALTQ3_COMPAT #ifdef ALTQ_KLD altq_module_declref(ifq->altq_type); #endif #endif ifq->altq_type = ALTQT_NONE; ifq->altq_disc = NULL; ifq->altq_enqueue = NULL; ifq->altq_dequeue = NULL; ifq->altq_request = NULL; ifq->altq_clfier = NULL; ifq->altq_classify = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; IFQ_UNLOCK(ifq); return 0; } int altq_enable(ifq) struct ifaltq *ifq; { int s; IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return 0; } s = splnet(); IFQ_PURGE_NOLOCK(ifq); ASSERT(ifq->ifq_len == 0); ifq->ifq_drv_maxlen = 0; /* disable bulk dequeue */ ifq->altq_flags |= ALTQF_ENABLED; if (ifq->altq_clfier != NULL) ifq->altq_flags |= ALTQF_CLASSIFY; splx(s); IFQ_UNLOCK(ifq); return 0; } int altq_disable(ifq) struct ifaltq *ifq; { int s; IFQ_LOCK(ifq); if (!ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return 0; } s = splnet(); IFQ_PURGE_NOLOCK(ifq); ASSERT(ifq->ifq_len == 0); ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY); splx(s); IFQ_UNLOCK(ifq); return 0; } #ifdef ALTQ_DEBUG void altq_assert(file, line, failedexpr) const char *file, *failedexpr; int line; { (void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n", failedexpr, file, line); panic("altq assertion"); /* NOTREACHED */ } #endif /* * internal representation of token bucket parameters * rate: byte_per_unittime << 32 * (((bits_per_sec) / 8) << 32) / machclk_freq * depth: byte << 32 * */ #define TBR_SHIFT 32 #define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) #define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) static struct mbuf * tbr_dequeue(ifq, op) struct ifaltq *ifq; int op; { struct tb_regulator *tbr; struct mbuf *m; int64_t interval; u_int64_t now; IFQ_LOCK_ASSERT(ifq); tbr = ifq->altq_tbr; if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { /* if this is a remove after poll, bypass tbr check */ } else { /* update token only when it is negative */ if (tbr->tbr_token <= 0) { now = read_machclk(); interval = now - tbr->tbr_last; if (interval >= tbr->tbr_filluptime) tbr->tbr_token = tbr->tbr_depth; else { tbr->tbr_token += interval * tbr->tbr_rate; if (tbr->tbr_token > tbr->tbr_depth) tbr->tbr_token = tbr->tbr_depth; } tbr->tbr_last = now; } /* if token is still negative, don't allow dequeue */ if (tbr->tbr_token <= 0) return (NULL); } if (ALTQ_IS_ENABLED(ifq)) m = (*ifq->altq_dequeue)(ifq, op); else { if (op == ALTDQ_POLL) _IF_POLL(ifq, m); else _IF_DEQUEUE(ifq, m); } if (m != NULL && op == ALTDQ_REMOVE) tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); tbr->tbr_lastop = op; return (m); } /* * set a token bucket regulator. * if the specified rate is zero, the token bucket regulator is deleted. */ int tbr_set(ifq, profile) struct ifaltq *ifq; struct tb_profile *profile; { struct tb_regulator *tbr, *otbr; if (tbr_dequeue_ptr == NULL) tbr_dequeue_ptr = tbr_dequeue; if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) { printf("tbr_set: no cpu clock available!\n"); return (ENXIO); } IFQ_LOCK(ifq); if (profile->rate == 0) { /* delete this tbr */ if ((tbr = ifq->altq_tbr) == NULL) { IFQ_UNLOCK(ifq); return (ENOENT); } ifq->altq_tbr = NULL; free(tbr, M_DEVBUF); IFQ_UNLOCK(ifq); return (0); } tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO); if (tbr == NULL) { IFQ_UNLOCK(ifq); return (ENOMEM); } tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; tbr->tbr_depth = TBR_SCALE(profile->depth); if (tbr->tbr_rate > 0) tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; else tbr->tbr_filluptime = 0xffffffffffffffffLL; tbr->tbr_token = tbr->tbr_depth; tbr->tbr_last = read_machclk(); tbr->tbr_lastop = ALTDQ_REMOVE; otbr = ifq->altq_tbr; ifq->altq_tbr = tbr; /* set the new tbr */ if (otbr != NULL) free(otbr, M_DEVBUF); else { if (tbr_timer == 0) { CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); tbr_timer = 1; } } IFQ_UNLOCK(ifq); return (0); } /* * tbr_timeout goes through the interface list, and kicks the drivers * if necessary. * * MPSAFE */ static void tbr_timeout(arg) void *arg; { VNET_ITERATOR_DECL(vnet_iter); struct ifnet *ifp; int active, s; active = 0; s = splnet(); IFNET_RLOCK_NOSLEEP(); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - for (ifp = TAILQ_FIRST(&V_ifnet); ifp; - ifp = TAILQ_NEXT(ifp, if_link)) { + for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp; + ifp = CK_STAILQ_NEXT(ifp, if_link)) { /* read from if_snd unlocked */ if (!TBR_IS_ENABLED(&ifp->if_snd)) continue; active++; if (!IFQ_IS_EMPTY(&ifp->if_snd) && ifp->if_start != NULL) (*ifp->if_start)(ifp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); IFNET_RUNLOCK_NOSLEEP(); splx(s); if (active > 0) CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); else tbr_timer = 0; /* don't need tbr_timer anymore */ } /* * get token bucket regulator profile */ int tbr_get(ifq, profile) struct ifaltq *ifq; struct tb_profile *profile; { struct tb_regulator *tbr; IFQ_LOCK(ifq); if ((tbr = ifq->altq_tbr) == NULL) { profile->rate = 0; profile->depth = 0; } else { profile->rate = (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq); profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth); } IFQ_UNLOCK(ifq); return (0); } /* * attach a discipline to the interface. if one already exists, it is * overridden. * Locking is done in the discipline specific attach functions. Basically * they call back to altq_attach which takes care of the attach and locking. */ int altq_pfattach(struct pf_altq *a) { int error = 0; switch (a->scheduler) { case ALTQT_NONE: break; #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_pfattach(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_pfattach(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_pfattach(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_pfattach(a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_pfattach(a); break; #endif default: error = ENXIO; } return (error); } /* * detach a discipline from the interface. * it is possible that the discipline was already overridden by another * discipline. */ int altq_pfdetach(struct pf_altq *a) { struct ifnet *ifp; int s, error = 0; if ((ifp = ifunit(a->ifname)) == NULL) return (EINVAL); /* if this discipline is no longer referenced, just return */ /* read unlocked from if_snd */ if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc) return (0); s = splnet(); /* read unlocked from if_snd, _disable and _detach take care */ if (ALTQ_IS_ENABLED(&ifp->if_snd)) error = altq_disable(&ifp->if_snd); if (error == 0) error = altq_detach(&ifp->if_snd); splx(s); return (error); } /* * add a discipline or a queue * Locking is done in the discipline specific functions with regards to * malloc with WAITOK, also it is not yet clear which lock to use. */ int altq_add(struct pf_altq *a) { int error = 0; if (a->qname[0] != 0) return (altq_add_queue(a)); if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) panic("altq_add: no cpu clock"); switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_add_altq(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_add_altq(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_add_altq(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_add_altq(a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_add_altq(a); break; #endif default: error = ENXIO; } return (error); } /* * remove a discipline or a queue * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_remove(struct pf_altq *a) { int error = 0; if (a->qname[0] != 0) return (altq_remove_queue(a)); switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_remove_altq(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_remove_altq(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_remove_altq(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_remove_altq(a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_remove_altq(a); break; #endif default: error = ENXIO; } return (error); } /* * add a queue to the discipline * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_add_queue(struct pf_altq *a) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_add_queue(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_add_queue(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_add_queue(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_add_queue(a); break; #endif default: error = ENXIO; } return (error); } /* * remove a queue from the discipline * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_remove_queue(struct pf_altq *a) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_remove_queue(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_remove_queue(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_remove_queue(a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_remove_queue(a); break; #endif default: error = ENXIO; } return (error); } /* * get queue statistics * Locking is done in the discipline specific functions with regards to * copyout operations, also it is not yet clear which lock to use. */ int altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_getqstats(a, ubuf, nbytes); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_getqstats(a, ubuf, nbytes); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_getqstats(a, ubuf, nbytes); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: error = fairq_getqstats(a, ubuf, nbytes); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: error = codel_getqstats(a, ubuf, nbytes); break; #endif default: error = ENXIO; } return (error); } /* * read and write diffserv field in IPv4 or IPv6 header */ u_int8_t read_dsfield(m, pktattr) struct mbuf *m; struct altq_pktattr *pktattr; { struct mbuf *m0; u_int8_t ds_field = 0; if (pktattr == NULL || (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) return ((u_int8_t)0); /* verify that pattr_hdr is within the mbuf data */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if ((pktattr->pattr_hdr >= m0->m_data) && (pktattr->pattr_hdr < m0->m_data + m0->m_len)) break; if (m0 == NULL) { /* ick, pattr_hdr is stale */ pktattr->pattr_af = AF_UNSPEC; #ifdef ALTQ_DEBUG printf("read_dsfield: can't locate header!\n"); #endif return ((u_int8_t)0); } if (pktattr->pattr_af == AF_INET) { struct ip *ip = (struct ip *)pktattr->pattr_hdr; if (ip->ip_v != 4) return ((u_int8_t)0); /* version mismatch! */ ds_field = ip->ip_tos; } #ifdef INET6 else if (pktattr->pattr_af == AF_INET6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return ((u_int8_t)0); /* version mismatch! */ ds_field = (flowlabel >> 20) & 0xff; } #endif return (ds_field); } void write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield) { struct mbuf *m0; if (pktattr == NULL || (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) return; /* verify that pattr_hdr is within the mbuf data */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if ((pktattr->pattr_hdr >= m0->m_data) && (pktattr->pattr_hdr < m0->m_data + m0->m_len)) break; if (m0 == NULL) { /* ick, pattr_hdr is stale */ pktattr->pattr_af = AF_UNSPEC; #ifdef ALTQ_DEBUG printf("write_dsfield: can't locate header!\n"); #endif return; } if (pktattr->pattr_af == AF_INET) { struct ip *ip = (struct ip *)pktattr->pattr_hdr; u_int8_t old; int32_t sum; if (ip->ip_v != 4) return; /* version mismatch! */ old = ip->ip_tos; dsfield |= old & 3; /* leave CU bits */ if (old == dsfield) return; ip->ip_tos = dsfield; /* * update checksum (from RFC1624) * HC' = ~(~HC + ~m + m') */ sum = ~ntohs(ip->ip_sum) & 0xffff; sum += 0xff00 + (~old & 0xff) + dsfield; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); /* add carry */ ip->ip_sum = htons(~sum & 0xffff); } #ifdef INET6 else if (pktattr->pattr_af == AF_INET6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return; /* version mismatch! */ flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); ip6->ip6_flow = htonl(flowlabel); } #endif return; } /* * high resolution clock support taking advantage of a machine dependent * high resolution time counter (e.g., timestamp counter of intel pentium). * we assume * - 64-bit-long monotonically-increasing counter * - frequency range is 100M-4GHz (CPU speed) */ /* if pcc is not available or disabled, emulate 256MHz using microtime() */ #define MACHCLK_SHIFT 8 int machclk_usepcc; u_int32_t machclk_freq; u_int32_t machclk_per_tick; #if defined(__i386__) && defined(__NetBSD__) extern u_int64_t cpu_tsc_freq; #endif #if (__FreeBSD_version >= 700035) /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { /* If there was an error during the transition, don't do anything. */ if (status != 0) return; #if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__)) /* If TSC is P-state invariant, don't do anything. */ if (tsc_is_invariant) return; #endif /* Total setting for this level gives the new frequency in MHz. */ init_machclk(); } EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_LAST); #endif /* __FreeBSD_version >= 700035 */ static void init_machclk_setup(void) { #if (__FreeBSD_version >= 600000) callout_init(&tbr_callout, 0); #endif machclk_usepcc = 1; #if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC) machclk_usepcc = 0; #endif #if defined(__FreeBSD__) && defined(SMP) machclk_usepcc = 0; #endif #if defined(__NetBSD__) && defined(MULTIPROCESSOR) machclk_usepcc = 0; #endif #if defined(__amd64__) || defined(__i386__) /* check if TSC is available */ if ((cpu_feature & CPUID_TSC) == 0 || atomic_load_acq_64(&tsc_freq) == 0) machclk_usepcc = 0; #endif } void init_machclk(void) { static int called; /* Call one-time initialization function. */ if (!called) { init_machclk_setup(); called = 1; } if (machclk_usepcc == 0) { /* emulate 256MHz using microtime() */ machclk_freq = 1000000 << MACHCLK_SHIFT; machclk_per_tick = machclk_freq / hz; #ifdef ALTQ_DEBUG printf("altq: emulate %uHz cpu clock\n", machclk_freq); #endif return; } /* * if the clock frequency (of Pentium TSC or Alpha PCC) is * accessible, just use it. */ #if defined(__amd64__) || defined(__i386__) machclk_freq = atomic_load_acq_64(&tsc_freq); #endif /* * if we don't know the clock frequency, measure it. */ if (machclk_freq == 0) { static int wait; struct timeval tv_start, tv_end; u_int64_t start, end, diff; int timo; microtime(&tv_start); start = read_machclk(); timo = hz; /* 1 sec */ (void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo); microtime(&tv_end); end = read_machclk(); diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 + tv_end.tv_usec - tv_start.tv_usec; if (diff != 0) machclk_freq = (u_int)((end - start) * 1000000 / diff); } machclk_per_tick = machclk_freq / hz; #ifdef ALTQ_DEBUG printf("altq: CPU clock: %uHz\n", machclk_freq); #endif } #if defined(__OpenBSD__) && defined(__i386__) static __inline u_int64_t rdtsc(void) { u_int64_t rv; __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); return (rv); } #endif /* __OpenBSD__ && __i386__ */ u_int64_t read_machclk(void) { u_int64_t val; if (machclk_usepcc) { #if defined(__amd64__) || defined(__i386__) val = rdtsc(); #else panic("read_machclk"); #endif } else { struct timeval tv, boottime; microtime(&tv); getboottime(&boottime); val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000 + tv.tv_usec) << MACHCLK_SHIFT); } return (val); } #ifdef ALTQ3_CLFIER_COMPAT #ifndef IPPROTO_ESP #define IPPROTO_ESP 50 /* encapsulating security payload */ #endif #ifndef IPPROTO_AH #define IPPROTO_AH 51 /* authentication header */ #endif /* * extract flow information from a given packet. * filt_mask shows flowinfo fields required. * we assume the ip header is in one mbuf, and addresses and ports are * in network byte order. */ int altq_extractflow(m, af, flow, filt_bmask) struct mbuf *m; int af; struct flowinfo *flow; u_int32_t filt_bmask; { switch (af) { case PF_INET: { struct flowinfo_in *fin; struct ip *ip; ip = mtod(m, struct ip *); if (ip->ip_v != 4) break; fin = (struct flowinfo_in *)flow; fin->fi_len = sizeof(struct flowinfo_in); fin->fi_family = AF_INET; fin->fi_proto = ip->ip_p; fin->fi_tos = ip->ip_tos; fin->fi_src.s_addr = ip->ip_src.s_addr; fin->fi_dst.s_addr = ip->ip_dst.s_addr; if (filt_bmask & FIMB4_PORTS) /* if port info is required, extract port numbers */ extract_ports4(m, ip, fin); else { fin->fi_sport = 0; fin->fi_dport = 0; fin->fi_gpi = 0; } return (1); } #ifdef INET6 case PF_INET6: { struct flowinfo_in6 *fin6; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); /* should we check the ip version? */ fin6 = (struct flowinfo_in6 *)flow; fin6->fi6_len = sizeof(struct flowinfo_in6); fin6->fi6_family = AF_INET6; fin6->fi6_proto = ip6->ip6_nxt; fin6->fi6_tclass = (ntohl(ip6->ip6_flow) >> 20) & 0xff; fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff); fin6->fi6_src = ip6->ip6_src; fin6->fi6_dst = ip6->ip6_dst; if ((filt_bmask & FIMB6_PORTS) || ((filt_bmask & FIMB6_PROTO) && ip6->ip6_nxt > IPPROTO_IPV6)) /* * if port info is required, or proto is required * but there are option headers, extract port * and protocol numbers. */ extract_ports6(m, ip6, fin6); else { fin6->fi6_sport = 0; fin6->fi6_dport = 0; fin6->fi6_gpi = 0; } return (1); } #endif /* INET6 */ default: break; } /* failed */ flow->fi_len = sizeof(struct flowinfo); flow->fi_family = AF_UNSPEC; return (0); } /* * helper routine to extract port numbers */ /* structure for ipsec and ipv6 option header template */ struct _opt6 { u_int8_t opt6_nxt; /* next header */ u_int8_t opt6_hlen; /* header extension length */ u_int16_t _pad; u_int32_t ah_spi; /* security parameter index for authentication header */ }; /* * extract port numbers from a ipv4 packet. */ static int extract_ports4(m, ip, fin) struct mbuf *m; struct ip *ip; struct flowinfo_in *fin; { struct mbuf *m0; u_short ip_off; u_int8_t proto; int off; fin->fi_sport = 0; fin->fi_dport = 0; fin->fi_gpi = 0; ip_off = ntohs(ip->ip_off); /* if it is a fragment, try cached fragment info */ if (ip_off & IP_OFFMASK) { ip4f_lookup(ip, fin); return (1); } /* locate the mbuf containing the protocol header */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if (((caddr_t)ip >= m0->m_data) && ((caddr_t)ip < m0->m_data + m0->m_len)) break; if (m0 == NULL) { #ifdef ALTQ_DEBUG printf("extract_ports4: can't locate header! ip=%p\n", ip); #endif return (0); } off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2); proto = ip->ip_p; #ifdef ALTQ_IPSEC again: #endif while (off >= m0->m_len) { off -= m0->m_len; m0 = m0->m_next; if (m0 == NULL) return (0); /* bogus ip_hl! */ } if (m0->m_len < off + 4) return (0); switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: { struct udphdr *udp; udp = (struct udphdr *)(mtod(m0, caddr_t) + off); fin->fi_sport = udp->uh_sport; fin->fi_dport = udp->uh_dport; fin->fi_proto = proto; } break; #ifdef ALTQ_IPSEC case IPPROTO_ESP: if (fin->fi_gpi == 0){ u_int32_t *gpi; gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); fin->fi_gpi = *gpi; } fin->fi_proto = proto; break; case IPPROTO_AH: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); proto = opt6->opt6_nxt; off += 8 + (opt6->opt6_hlen * 4); if (fin->fi_gpi == 0 && m0->m_len >= off + 8) fin->fi_gpi = opt6->ah_spi; } /* goto the next header */ goto again; #endif /* ALTQ_IPSEC */ default: fin->fi_proto = proto; return (0); } /* if this is a first fragment, cache it. */ if (ip_off & IP_MF) ip4f_cache(ip, fin); return (1); } #ifdef INET6 static int extract_ports6(m, ip6, fin6) struct mbuf *m; struct ip6_hdr *ip6; struct flowinfo_in6 *fin6; { struct mbuf *m0; int off; u_int8_t proto; fin6->fi6_gpi = 0; fin6->fi6_sport = 0; fin6->fi6_dport = 0; /* locate the mbuf containing the protocol header */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if (((caddr_t)ip6 >= m0->m_data) && ((caddr_t)ip6 < m0->m_data + m0->m_len)) break; if (m0 == NULL) { #ifdef ALTQ_DEBUG printf("extract_ports6: can't locate header! ip6=%p\n", ip6); #endif return (0); } off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; do { while (off >= m0->m_len) { off -= m0->m_len; m0 = m0->m_next; if (m0 == NULL) return (0); } if (m0->m_len < off + 4) return (0); switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: { struct udphdr *udp; udp = (struct udphdr *)(mtod(m0, caddr_t) + off); fin6->fi6_sport = udp->uh_sport; fin6->fi6_dport = udp->uh_dport; fin6->fi6_proto = proto; } return (1); case IPPROTO_ESP: if (fin6->fi6_gpi == 0) { u_int32_t *gpi; gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); fin6->fi6_gpi = *gpi; } fin6->fi6_proto = proto; return (1); case IPPROTO_AH: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8) fin6->fi6_gpi = opt6->ah_spi; proto = opt6->opt6_nxt; off += 8 + (opt6->opt6_hlen * 4); /* goto the next header */ break; } case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); proto = opt6->opt6_nxt; off += (opt6->opt6_hlen + 1) * 8; /* goto the next header */ break; } case IPPROTO_FRAGMENT: /* ipv6 fragmentations are not supported yet */ default: fin6->fi6_proto = proto; return (0); } } while (1); /*NOTREACHED*/ } #endif /* INET6 */ /* * altq common classifier */ int acc_add_filter(classifier, filter, class, phandle) struct acc_classifier *classifier; struct flow_filter *filter; void *class; u_long *phandle; { struct acc_filter *afp, *prev, *tmp; int i, s; #ifdef INET6 if (filter->ff_flow.fi_family != AF_INET && filter->ff_flow.fi_family != AF_INET6) return (EINVAL); #else if (filter->ff_flow.fi_family != AF_INET) return (EINVAL); #endif afp = malloc(sizeof(struct acc_filter), M_DEVBUF, M_WAITOK); if (afp == NULL) return (ENOMEM); bzero(afp, sizeof(struct acc_filter)); afp->f_filter = *filter; afp->f_class = class; i = ACC_WILDCARD_INDEX; if (filter->ff_flow.fi_family == AF_INET) { struct flow_filter *filter4 = &afp->f_filter; /* * if address is 0, it's a wildcard. if address mask * isn't set, use full mask. */ if (filter4->ff_flow.fi_dst.s_addr == 0) filter4->ff_mask.mask_dst.s_addr = 0; else if (filter4->ff_mask.mask_dst.s_addr == 0) filter4->ff_mask.mask_dst.s_addr = 0xffffffff; if (filter4->ff_flow.fi_src.s_addr == 0) filter4->ff_mask.mask_src.s_addr = 0; else if (filter4->ff_mask.mask_src.s_addr == 0) filter4->ff_mask.mask_src.s_addr = 0xffffffff; /* clear extra bits in addresses */ filter4->ff_flow.fi_dst.s_addr &= filter4->ff_mask.mask_dst.s_addr; filter4->ff_flow.fi_src.s_addr &= filter4->ff_mask.mask_src.s_addr; /* * if dst address is a wildcard, use hash-entry * ACC_WILDCARD_INDEX. */ if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff) i = ACC_WILDCARD_INDEX; else i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr); } #ifdef INET6 else if (filter->ff_flow.fi_family == AF_INET6) { struct flow_filter6 *filter6 = (struct flow_filter6 *)&afp->f_filter; #ifndef IN6MASK0 /* taken from kame ipv6 */ #define IN6MASK0 {{{ 0, 0, 0, 0 }}} #define IN6MASK128 {{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}} const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask128 = IN6MASK128; #endif if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst)) filter6->ff_mask6.mask6_dst = in6mask0; else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst)) filter6->ff_mask6.mask6_dst = in6mask128; if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src)) filter6->ff_mask6.mask6_src = in6mask0; else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src)) filter6->ff_mask6.mask6_src = in6mask128; /* clear extra bits in addresses */ for (i = 0; i < 16; i++) filter6->ff_flow6.fi6_dst.s6_addr[i] &= filter6->ff_mask6.mask6_dst.s6_addr[i]; for (i = 0; i < 16; i++) filter6->ff_flow6.fi6_src.s6_addr[i] &= filter6->ff_mask6.mask6_src.s6_addr[i]; if (filter6->ff_flow6.fi6_flowlabel == 0) i = ACC_WILDCARD_INDEX; else i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel); } #endif /* INET6 */ afp->f_handle = get_filt_handle(classifier, i); /* update filter bitmask */ afp->f_fbmask = filt2fibmask(filter); classifier->acc_fbmask |= afp->f_fbmask; /* * add this filter to the filter list. * filters are ordered from the highest rule number. */ s = splnet(); prev = NULL; LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) { if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno) prev = tmp; else break; } if (prev == NULL) LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain); else LIST_INSERT_AFTER(prev, afp, f_chain); splx(s); *phandle = afp->f_handle; return (0); } int acc_delete_filter(classifier, handle) struct acc_classifier *classifier; u_long handle; { struct acc_filter *afp; int s; if ((afp = filth_to_filtp(classifier, handle)) == NULL) return (EINVAL); s = splnet(); LIST_REMOVE(afp, f_chain); splx(s); free(afp, M_DEVBUF); /* todo: update filt_bmask */ return (0); } /* * delete filters referencing to the specified class. * if the all flag is not 0, delete all the filters. */ int acc_discard_filters(classifier, class, all) struct acc_classifier *classifier; void *class; int all; { struct acc_filter *afp; int i, s; s = splnet(); for (i = 0; i < ACC_FILTER_TABLESIZE; i++) { do { LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (all || afp->f_class == class) { LIST_REMOVE(afp, f_chain); free(afp, M_DEVBUF); /* start again from the head */ break; } } while (afp != NULL); } splx(s); if (all) classifier->acc_fbmask = 0; return (0); } void * acc_classify(clfier, m, af) void *clfier; struct mbuf *m; int af; { struct acc_classifier *classifier; struct flowinfo flow; struct acc_filter *afp; int i; classifier = (struct acc_classifier *)clfier; altq_extractflow(m, af, &flow, classifier->acc_fbmask); if (flow.fi_family == AF_INET) { struct flowinfo_in *fp = (struct flowinfo_in *)&flow; if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) { /* only tos is used */ LIST_FOREACH(afp, &classifier->acc_filters[ACC_WILDCARD_INDEX], f_chain) if (apply_tosfilter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); } else if ((classifier->acc_fbmask & (~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL)) == 0) { /* only proto and ports are used */ LIST_FOREACH(afp, &classifier->acc_filters[ACC_WILDCARD_INDEX], f_chain) if (apply_ppfilter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); } else { /* get the filter hash entry from its dest address */ i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr); do { /* * go through this loop twice. first for dst * hash, second for wildcards. */ LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (apply_filter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); /* * check again for filters with a dst addr * wildcard. * (daddr == 0 || dmask != 0xffffffff). */ if (i != ACC_WILDCARD_INDEX) i = ACC_WILDCARD_INDEX; else break; } while (1); } } #ifdef INET6 else if (flow.fi_family == AF_INET6) { struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow; /* get the filter hash entry from its flow ID */ if (fp6->fi6_flowlabel != 0) i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel); else /* flowlable can be zero */ i = ACC_WILDCARD_INDEX; /* go through this loop twice. first for flow hash, second for wildcards. */ do { LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (apply_filter6(afp->f_fbmask, (struct flow_filter6 *)&afp->f_filter, fp6)) /* filter matched */ return (afp->f_class); /* * check again for filters with a wildcard. */ if (i != ACC_WILDCARD_INDEX) i = ACC_WILDCARD_INDEX; else break; } while (1); } #endif /* INET6 */ /* no filter matched */ return (NULL); } static int apply_filter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) return (0); if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) return (0); if ((fbmask & FIMB4_DADDR) && filt->ff_flow.fi_dst.s_addr != (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr)) return (0); if ((fbmask & FIMB4_SADDR) && filt->ff_flow.fi_src.s_addr != (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr)) return (0); if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) return (0); if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != (pkt->fi_tos & filt->ff_mask.mask_tos)) return (0); if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi)) return (0); /* match */ return (1); } /* * filter matching function optimized for a common case that checks * only protocol and port numbers */ static int apply_ppfilter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) return (0); if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) return (0); if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) return (0); /* match */ return (1); } /* * filter matching function only for tos field. */ static int apply_tosfilter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != (pkt->fi_tos & filt->ff_mask.mask_tos)) return (0); /* match */ return (1); } #ifdef INET6 static int apply_filter6(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter6 *filt; struct flowinfo_in6 *pkt; { int i; if (filt->ff_flow6.fi6_family != AF_INET6) return (0); if ((fbmask & FIMB6_FLABEL) && filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel) return (0); if ((fbmask & FIMB6_PROTO) && filt->ff_flow6.fi6_proto != pkt->fi6_proto) return (0); if ((fbmask & FIMB6_SPORT) && filt->ff_flow6.fi6_sport != pkt->fi6_sport) return (0); if ((fbmask & FIMB6_DPORT) && filt->ff_flow6.fi6_dport != pkt->fi6_dport) return (0); if (fbmask & FIMB6_SADDR) { for (i = 0; i < 4; i++) if (filt->ff_flow6.fi6_src.s6_addr32[i] != (pkt->fi6_src.s6_addr32[i] & filt->ff_mask6.mask6_src.s6_addr32[i])) return (0); } if (fbmask & FIMB6_DADDR) { for (i = 0; i < 4; i++) if (filt->ff_flow6.fi6_dst.s6_addr32[i] != (pkt->fi6_dst.s6_addr32[i] & filt->ff_mask6.mask6_dst.s6_addr32[i])) return (0); } if ((fbmask & FIMB6_TCLASS) && filt->ff_flow6.fi6_tclass != (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass)) return (0); if ((fbmask & FIMB6_GPI) && filt->ff_flow6.fi6_gpi != pkt->fi6_gpi) return (0); /* match */ return (1); } #endif /* INET6 */ /* * filter handle: * bit 20-28: index to the filter hash table * bit 0-19: unique id in the hash bucket. */ static u_long get_filt_handle(classifier, i) struct acc_classifier *classifier; int i; { static u_long handle_number = 1; u_long handle; struct acc_filter *afp; while (1) { handle = handle_number++ & 0x000fffff; if (LIST_EMPTY(&classifier->acc_filters[i])) break; LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if ((afp->f_handle & 0x000fffff) == handle) break; if (afp == NULL) break; /* this handle is already used, try again */ } return ((i << 20) | handle); } /* convert filter handle to filter pointer */ static struct acc_filter * filth_to_filtp(classifier, handle) struct acc_classifier *classifier; u_long handle; { struct acc_filter *afp; int i; i = ACC_GET_HINDEX(handle); LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (afp->f_handle == handle) return (afp); return (NULL); } /* create flowinfo bitmask */ static u_int32_t filt2fibmask(filt) struct flow_filter *filt; { u_int32_t mask = 0; #ifdef INET6 struct flow_filter6 *filt6; #endif switch (filt->ff_flow.fi_family) { case AF_INET: if (filt->ff_flow.fi_proto != 0) mask |= FIMB4_PROTO; if (filt->ff_flow.fi_tos != 0) mask |= FIMB4_TOS; if (filt->ff_flow.fi_dst.s_addr != 0) mask |= FIMB4_DADDR; if (filt->ff_flow.fi_src.s_addr != 0) mask |= FIMB4_SADDR; if (filt->ff_flow.fi_sport != 0) mask |= FIMB4_SPORT; if (filt->ff_flow.fi_dport != 0) mask |= FIMB4_DPORT; if (filt->ff_flow.fi_gpi != 0) mask |= FIMB4_GPI; break; #ifdef INET6 case AF_INET6: filt6 = (struct flow_filter6 *)filt; if (filt6->ff_flow6.fi6_proto != 0) mask |= FIMB6_PROTO; if (filt6->ff_flow6.fi6_tclass != 0) mask |= FIMB6_TCLASS; if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst)) mask |= FIMB6_DADDR; if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src)) mask |= FIMB6_SADDR; if (filt6->ff_flow6.fi6_sport != 0) mask |= FIMB6_SPORT; if (filt6->ff_flow6.fi6_dport != 0) mask |= FIMB6_DPORT; if (filt6->ff_flow6.fi6_gpi != 0) mask |= FIMB6_GPI; if (filt6->ff_flow6.fi6_flowlabel != 0) mask |= FIMB6_FLABEL; break; #endif /* INET6 */ } return (mask); } /* * helper functions to handle IPv4 fragments. * currently only in-sequence fragments are handled. * - fragment info is cached in a LRU list. * - when a first fragment is found, cache its flow info. * - when a non-first fragment is found, lookup the cache. */ struct ip4_frag { TAILQ_ENTRY(ip4_frag) ip4f_chain; char ip4f_valid; u_short ip4f_id; struct flowinfo_in ip4f_info; }; static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */ #define IP4F_TABSIZE 16 /* IPv4 fragment cache size */ static void ip4f_cache(ip, fin) struct ip *ip; struct flowinfo_in *fin; { struct ip4_frag *fp; if (TAILQ_EMPTY(&ip4f_list)) { /* first time call, allocate fragment cache entries. */ if (ip4f_init() < 0) /* allocation failed! */ return; } fp = ip4f_alloc(); fp->ip4f_id = ip->ip_id; fp->ip4f_info.fi_proto = ip->ip_p; fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr; fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr; /* save port numbers */ fp->ip4f_info.fi_sport = fin->fi_sport; fp->ip4f_info.fi_dport = fin->fi_dport; fp->ip4f_info.fi_gpi = fin->fi_gpi; } static int ip4f_lookup(ip, fin) struct ip *ip; struct flowinfo_in *fin; { struct ip4_frag *fp; for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid; fp = TAILQ_NEXT(fp, ip4f_chain)) if (ip->ip_id == fp->ip4f_id && ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr && ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr && ip->ip_p == fp->ip4f_info.fi_proto) { /* found the matching entry */ fin->fi_sport = fp->ip4f_info.fi_sport; fin->fi_dport = fp->ip4f_info.fi_dport; fin->fi_gpi = fp->ip4f_info.fi_gpi; if ((ntohs(ip->ip_off) & IP_MF) == 0) /* this is the last fragment, release the entry. */ ip4f_free(fp); return (1); } /* no matching entry found */ return (0); } static int ip4f_init(void) { struct ip4_frag *fp; int i; TAILQ_INIT(&ip4f_list); for (i=0; iip4f_valid = 0; TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); } return (0); } static struct ip4_frag * ip4f_alloc(void) { struct ip4_frag *fp; /* reclaim an entry at the tail, put it at the head */ fp = TAILQ_LAST(&ip4f_list, ip4f_list); TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); fp->ip4f_valid = 1; TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain); return (fp); } static void ip4f_free(fp) struct ip4_frag *fp; { TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); fp->ip4f_valid = 0; TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); } #endif /* ALTQ3_CLFIER_COMPAT */ Index: head/sys/net/bridgestp.c =================================================================== --- head/sys/net/bridgestp.c (revision 334117) +++ head/sys/net/bridgestp.c (revision 334118) @@ -1,2276 +1,2276 @@ /* $NetBSD: bridgestp.c,v 1.5 2003/11/28 08:56:48 keihan Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2000 Jason L. Wright (jason@thought.net) * Copyright (c) 2006 Andrew Thompson (thompsa@FreeBSD.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp */ /* * Implementation of the spanning tree protocol as defined in * ISO/IEC 802.1D-2004, June 9, 2004. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef BRIDGESTP_DEBUG #define DPRINTF(fmt, arg...) printf("bstp: " fmt, ##arg) #else #define DPRINTF(fmt, arg...) (void)0 #endif #define PV2ADDR(pv, eaddr) do { \ eaddr[0] = pv >> 40; \ eaddr[1] = pv >> 32; \ eaddr[2] = pv >> 24; \ eaddr[3] = pv >> 16; \ eaddr[4] = pv >> 8; \ eaddr[5] = pv >> 0; \ } while (0) #define INFO_BETTER 1 #define INFO_SAME 0 #define INFO_WORSE -1 const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; LIST_HEAD(, bstp_state) bstp_list; static struct mtx bstp_list_mtx; static void bstp_transmit(struct bstp_state *, struct bstp_port *); static void bstp_transmit_bpdu(struct bstp_state *, struct bstp_port *); static void bstp_transmit_tcn(struct bstp_state *, struct bstp_port *); static void bstp_decode_bpdu(struct bstp_port *, struct bstp_cbpdu *, struct bstp_config_unit *); static void bstp_send_bpdu(struct bstp_state *, struct bstp_port *, struct bstp_cbpdu *); static int bstp_pdu_flags(struct bstp_port *); static void bstp_received_stp(struct bstp_state *, struct bstp_port *, struct mbuf **, struct bstp_tbpdu *); static void bstp_received_rstp(struct bstp_state *, struct bstp_port *, struct mbuf **, struct bstp_tbpdu *); static void bstp_received_tcn(struct bstp_state *, struct bstp_port *, struct bstp_tcn_unit *); static void bstp_received_bpdu(struct bstp_state *, struct bstp_port *, struct bstp_config_unit *); static int bstp_pdu_rcvtype(struct bstp_port *, struct bstp_config_unit *); static int bstp_pdu_bettersame(struct bstp_port *, int); static int bstp_info_cmp(struct bstp_pri_vector *, struct bstp_pri_vector *); static int bstp_info_superior(struct bstp_pri_vector *, struct bstp_pri_vector *); static void bstp_assign_roles(struct bstp_state *); static void bstp_update_roles(struct bstp_state *, struct bstp_port *); static void bstp_update_state(struct bstp_state *, struct bstp_port *); static void bstp_update_tc(struct bstp_port *); static void bstp_update_info(struct bstp_port *); static void bstp_set_other_tcprop(struct bstp_port *); static void bstp_set_all_reroot(struct bstp_state *); static void bstp_set_all_sync(struct bstp_state *); static void bstp_set_port_state(struct bstp_port *, int); static void bstp_set_port_role(struct bstp_port *, int); static void bstp_set_port_proto(struct bstp_port *, int); static void bstp_set_port_tc(struct bstp_port *, int); static void bstp_set_timer_tc(struct bstp_port *); static void bstp_set_timer_msgage(struct bstp_port *); static int bstp_rerooted(struct bstp_state *, struct bstp_port *); static uint32_t bstp_calc_path_cost(struct bstp_port *); static void bstp_notify_state(void *, int); static void bstp_notify_rtage(void *, int); static void bstp_ifupdstatus(void *, int); static void bstp_enable_port(struct bstp_state *, struct bstp_port *); static void bstp_disable_port(struct bstp_state *, struct bstp_port *); static void bstp_tick(void *); static void bstp_timer_start(struct bstp_timer *, uint16_t); static void bstp_timer_stop(struct bstp_timer *); static void bstp_timer_latch(struct bstp_timer *); static int bstp_timer_dectest(struct bstp_timer *); static void bstp_hello_timer_expiry(struct bstp_state *, struct bstp_port *); static void bstp_message_age_expiry(struct bstp_state *, struct bstp_port *); static void bstp_migrate_delay_expiry(struct bstp_state *, struct bstp_port *); static void bstp_edge_delay_expiry(struct bstp_state *, struct bstp_port *); static int bstp_addr_cmp(const uint8_t *, const uint8_t *); static int bstp_same_bridgeid(uint64_t, uint64_t); static void bstp_reinit(struct bstp_state *); static void bstp_transmit(struct bstp_state *bs, struct bstp_port *bp) { if (bs->bs_running == 0) return; /* * a PDU can only be sent if we have tx quota left and the * hello timer is running. */ if (bp->bp_hello_timer.active == 0) { /* Test if it needs to be reset */ bstp_hello_timer_expiry(bs, bp); return; } if (bp->bp_txcount > bs->bs_txholdcount) /* Ran out of karma */ return; if (bp->bp_protover == BSTP_PROTO_RSTP) { bstp_transmit_bpdu(bs, bp); bp->bp_tc_ack = 0; } else { /* STP */ switch (bp->bp_role) { case BSTP_ROLE_DESIGNATED: bstp_transmit_bpdu(bs, bp); bp->bp_tc_ack = 0; break; case BSTP_ROLE_ROOT: bstp_transmit_tcn(bs, bp); break; } } bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); bp->bp_flags &= ~BSTP_PORT_NEWINFO; } static void bstp_transmit_bpdu(struct bstp_state *bs, struct bstp_port *bp) { struct bstp_cbpdu bpdu; BSTP_LOCK_ASSERT(bs); bpdu.cbu_rootpri = htons(bp->bp_desg_pv.pv_root_id >> 48); PV2ADDR(bp->bp_desg_pv.pv_root_id, bpdu.cbu_rootaddr); bpdu.cbu_rootpathcost = htonl(bp->bp_desg_pv.pv_cost); bpdu.cbu_bridgepri = htons(bp->bp_desg_pv.pv_dbridge_id >> 48); PV2ADDR(bp->bp_desg_pv.pv_dbridge_id, bpdu.cbu_bridgeaddr); bpdu.cbu_portid = htons(bp->bp_port_id); bpdu.cbu_messageage = htons(bp->bp_desg_msg_age); bpdu.cbu_maxage = htons(bp->bp_desg_max_age); bpdu.cbu_hellotime = htons(bp->bp_desg_htime); bpdu.cbu_forwarddelay = htons(bp->bp_desg_fdelay); bpdu.cbu_flags = bstp_pdu_flags(bp); switch (bp->bp_protover) { case BSTP_PROTO_STP: bpdu.cbu_bpdutype = BSTP_MSGTYPE_CFG; break; case BSTP_PROTO_RSTP: bpdu.cbu_bpdutype = BSTP_MSGTYPE_RSTP; break; } bstp_send_bpdu(bs, bp, &bpdu); } static void bstp_transmit_tcn(struct bstp_state *bs, struct bstp_port *bp) { struct bstp_tbpdu bpdu; struct ifnet *ifp = bp->bp_ifp; struct ether_header *eh; struct mbuf *m; KASSERT(bp == bs->bs_root_port, ("%s: bad root port\n", __func__)); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu); m->m_len = m->m_pkthdr.len; eh = mtod(m, struct ether_header *); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); eh->ether_type = htons(sizeof(bpdu)); bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP; bpdu.tbu_ctl = LLC_UI; bpdu.tbu_protoid = 0; bpdu.tbu_protover = 0; bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN; memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu)); bp->bp_txcount++; ifp->if_transmit(ifp, m); } static void bstp_decode_bpdu(struct bstp_port *bp, struct bstp_cbpdu *cpdu, struct bstp_config_unit *cu) { int flags; cu->cu_pv.pv_root_id = (((uint64_t)ntohs(cpdu->cbu_rootpri)) << 48) | (((uint64_t)cpdu->cbu_rootaddr[0]) << 40) | (((uint64_t)cpdu->cbu_rootaddr[1]) << 32) | (((uint64_t)cpdu->cbu_rootaddr[2]) << 24) | (((uint64_t)cpdu->cbu_rootaddr[3]) << 16) | (((uint64_t)cpdu->cbu_rootaddr[4]) << 8) | (((uint64_t)cpdu->cbu_rootaddr[5]) << 0); cu->cu_pv.pv_dbridge_id = (((uint64_t)ntohs(cpdu->cbu_bridgepri)) << 48) | (((uint64_t)cpdu->cbu_bridgeaddr[0]) << 40) | (((uint64_t)cpdu->cbu_bridgeaddr[1]) << 32) | (((uint64_t)cpdu->cbu_bridgeaddr[2]) << 24) | (((uint64_t)cpdu->cbu_bridgeaddr[3]) << 16) | (((uint64_t)cpdu->cbu_bridgeaddr[4]) << 8) | (((uint64_t)cpdu->cbu_bridgeaddr[5]) << 0); cu->cu_pv.pv_cost = ntohl(cpdu->cbu_rootpathcost); cu->cu_message_age = ntohs(cpdu->cbu_messageage); cu->cu_max_age = ntohs(cpdu->cbu_maxage); cu->cu_hello_time = ntohs(cpdu->cbu_hellotime); cu->cu_forward_delay = ntohs(cpdu->cbu_forwarddelay); cu->cu_pv.pv_dport_id = ntohs(cpdu->cbu_portid); cu->cu_pv.pv_port_id = bp->bp_port_id; cu->cu_message_type = cpdu->cbu_bpdutype; /* Strip off unused flags in STP mode */ flags = cpdu->cbu_flags; switch (cpdu->cbu_protover) { case BSTP_PROTO_STP: flags &= BSTP_PDU_STPMASK; /* A STP BPDU explicitly conveys a Designated Port */ cu->cu_role = BSTP_ROLE_DESIGNATED; break; case BSTP_PROTO_RSTP: flags &= BSTP_PDU_RSTPMASK; break; } cu->cu_topology_change_ack = (flags & BSTP_PDU_F_TCA) ? 1 : 0; cu->cu_proposal = (flags & BSTP_PDU_F_P) ? 1 : 0; cu->cu_agree = (flags & BSTP_PDU_F_A) ? 1 : 0; cu->cu_learning = (flags & BSTP_PDU_F_L) ? 1 : 0; cu->cu_forwarding = (flags & BSTP_PDU_F_F) ? 1 : 0; cu->cu_topology_change = (flags & BSTP_PDU_F_TC) ? 1 : 0; switch ((flags & BSTP_PDU_PRMASK) >> BSTP_PDU_PRSHIFT) { case BSTP_PDU_F_ROOT: cu->cu_role = BSTP_ROLE_ROOT; break; case BSTP_PDU_F_ALT: cu->cu_role = BSTP_ROLE_ALTERNATE; break; case BSTP_PDU_F_DESG: cu->cu_role = BSTP_ROLE_DESIGNATED; break; } } static void bstp_send_bpdu(struct bstp_state *bs, struct bstp_port *bp, struct bstp_cbpdu *bpdu) { struct ifnet *ifp; struct mbuf *m; struct ether_header *eh; BSTP_LOCK_ASSERT(bs); ifp = bp->bp_ifp; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; eh = mtod(m, struct ether_header *); bpdu->cbu_ssap = bpdu->cbu_dsap = LLC_8021D_LSAP; bpdu->cbu_ctl = LLC_UI; bpdu->cbu_protoid = htons(BSTP_PROTO_ID); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); switch (bpdu->cbu_bpdutype) { case BSTP_MSGTYPE_CFG: bpdu->cbu_protover = BSTP_PROTO_STP; m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_STP_LEN; eh->ether_type = htons(BSTP_BPDU_STP_LEN); memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, BSTP_BPDU_STP_LEN); break; case BSTP_MSGTYPE_RSTP: bpdu->cbu_protover = BSTP_PROTO_RSTP; bpdu->cbu_versionlen = htons(0); m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_RSTP_LEN; eh->ether_type = htons(BSTP_BPDU_RSTP_LEN); memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, BSTP_BPDU_RSTP_LEN); break; default: panic("not implemented"); } m->m_pkthdr.rcvif = ifp; m->m_len = m->m_pkthdr.len; bp->bp_txcount++; ifp->if_transmit(ifp, m); } static int bstp_pdu_flags(struct bstp_port *bp) { int flags = 0; if (bp->bp_proposing && bp->bp_state != BSTP_IFSTATE_FORWARDING) flags |= BSTP_PDU_F_P; if (bp->bp_agree) flags |= BSTP_PDU_F_A; if (bp->bp_tc_timer.active) flags |= BSTP_PDU_F_TC; if (bp->bp_tc_ack) flags |= BSTP_PDU_F_TCA; switch (bp->bp_state) { case BSTP_IFSTATE_LEARNING: flags |= BSTP_PDU_F_L; break; case BSTP_IFSTATE_FORWARDING: flags |= (BSTP_PDU_F_L | BSTP_PDU_F_F); break; } switch (bp->bp_role) { case BSTP_ROLE_ROOT: flags |= (BSTP_PDU_F_ROOT << BSTP_PDU_PRSHIFT); break; case BSTP_ROLE_ALTERNATE: case BSTP_ROLE_BACKUP: /* fall through */ flags |= (BSTP_PDU_F_ALT << BSTP_PDU_PRSHIFT); break; case BSTP_ROLE_DESIGNATED: flags |= (BSTP_PDU_F_DESG << BSTP_PDU_PRSHIFT); break; } /* Strip off unused flags in either mode */ switch (bp->bp_protover) { case BSTP_PROTO_STP: flags &= BSTP_PDU_STPMASK; break; case BSTP_PROTO_RSTP: flags &= BSTP_PDU_RSTPMASK; break; } return (flags); } void bstp_input(struct bstp_port *bp, struct ifnet *ifp, struct mbuf *m) { struct bstp_state *bs = bp->bp_bs; struct ether_header *eh; struct bstp_tbpdu tpdu; uint16_t len; if (bp->bp_active == 0) { m_freem(m); return; } BSTP_LOCK(bs); eh = mtod(m, struct ether_header *); len = ntohs(eh->ether_type); if (len < sizeof(tpdu)) goto out; m_adj(m, ETHER_HDR_LEN); if (m->m_pkthdr.len > len) m_adj(m, len - m->m_pkthdr.len); if (m->m_len < sizeof(tpdu) && (m = m_pullup(m, sizeof(tpdu))) == NULL) goto out; memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu)); /* basic packet checks */ if (tpdu.tbu_dsap != LLC_8021D_LSAP || tpdu.tbu_ssap != LLC_8021D_LSAP || tpdu.tbu_ctl != LLC_UI) goto out; if (tpdu.tbu_protoid != BSTP_PROTO_ID) goto out; /* * We can treat later versions of the PDU as the same as the maximum * version we implement. All additional parameters/flags are ignored. */ if (tpdu.tbu_protover > BSTP_PROTO_MAX) tpdu.tbu_protover = BSTP_PROTO_MAX; if (tpdu.tbu_protover != bp->bp_protover) { /* * Wait for the migration delay timer to expire before changing * protocol version to avoid flip-flops. */ if (bp->bp_flags & BSTP_PORT_CANMIGRATE) bstp_set_port_proto(bp, tpdu.tbu_protover); else goto out; } /* Clear operedge upon receiving a PDU on the port */ bp->bp_operedge = 0; bstp_timer_start(&bp->bp_edge_delay_timer, BSTP_DEFAULT_MIGRATE_DELAY); switch (tpdu.tbu_protover) { case BSTP_PROTO_STP: bstp_received_stp(bs, bp, &m, &tpdu); break; case BSTP_PROTO_RSTP: bstp_received_rstp(bs, bp, &m, &tpdu); break; } out: BSTP_UNLOCK(bs); if (m) m_freem(m); } static void bstp_received_stp(struct bstp_state *bs, struct bstp_port *bp, struct mbuf **mp, struct bstp_tbpdu *tpdu) { struct bstp_cbpdu cpdu; struct bstp_config_unit *cu = &bp->bp_msg_cu; struct bstp_tcn_unit tu; switch (tpdu->tbu_bpdutype) { case BSTP_MSGTYPE_TCN: tu.tu_message_type = tpdu->tbu_bpdutype; bstp_received_tcn(bs, bp, &tu); break; case BSTP_MSGTYPE_CFG: if ((*mp)->m_len < BSTP_BPDU_STP_LEN && (*mp = m_pullup(*mp, BSTP_BPDU_STP_LEN)) == NULL) return; memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_STP_LEN); bstp_decode_bpdu(bp, &cpdu, cu); bstp_received_bpdu(bs, bp, cu); break; } } static void bstp_received_rstp(struct bstp_state *bs, struct bstp_port *bp, struct mbuf **mp, struct bstp_tbpdu *tpdu) { struct bstp_cbpdu cpdu; struct bstp_config_unit *cu = &bp->bp_msg_cu; if (tpdu->tbu_bpdutype != BSTP_MSGTYPE_RSTP) return; if ((*mp)->m_len < BSTP_BPDU_RSTP_LEN && (*mp = m_pullup(*mp, BSTP_BPDU_RSTP_LEN)) == NULL) return; memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_RSTP_LEN); bstp_decode_bpdu(bp, &cpdu, cu); bstp_received_bpdu(bs, bp, cu); } static void bstp_received_tcn(struct bstp_state *bs, struct bstp_port *bp, struct bstp_tcn_unit *tcn) { bp->bp_rcvdtcn = 1; bstp_update_tc(bp); } static void bstp_received_bpdu(struct bstp_state *bs, struct bstp_port *bp, struct bstp_config_unit *cu) { int type; BSTP_LOCK_ASSERT(bs); /* We need to have transitioned to INFO_MINE before proceeding */ switch (bp->bp_infois) { case BSTP_INFO_DISABLED: case BSTP_INFO_AGED: return; } type = bstp_pdu_rcvtype(bp, cu); switch (type) { case BSTP_PDU_SUPERIOR: bs->bs_allsynced = 0; bp->bp_agreed = 0; bp->bp_proposing = 0; if (cu->cu_proposal && cu->cu_forwarding == 0) bp->bp_proposed = 1; if (cu->cu_topology_change) bp->bp_rcvdtc = 1; if (cu->cu_topology_change_ack) bp->bp_rcvdtca = 1; if (bp->bp_agree && !bstp_pdu_bettersame(bp, BSTP_INFO_RECEIVED)) bp->bp_agree = 0; /* copy the received priority and timers to the port */ bp->bp_port_pv = cu->cu_pv; bp->bp_port_msg_age = cu->cu_message_age; bp->bp_port_max_age = cu->cu_max_age; bp->bp_port_fdelay = cu->cu_forward_delay; bp->bp_port_htime = (cu->cu_hello_time > BSTP_MIN_HELLO_TIME ? cu->cu_hello_time : BSTP_MIN_HELLO_TIME); /* set expiry for the new info */ bstp_set_timer_msgage(bp); bp->bp_infois = BSTP_INFO_RECEIVED; bstp_assign_roles(bs); break; case BSTP_PDU_REPEATED: if (cu->cu_proposal && cu->cu_forwarding == 0) bp->bp_proposed = 1; if (cu->cu_topology_change) bp->bp_rcvdtc = 1; if (cu->cu_topology_change_ack) bp->bp_rcvdtca = 1; /* rearm the age timer */ bstp_set_timer_msgage(bp); break; case BSTP_PDU_INFERIOR: if (cu->cu_learning) { bp->bp_agreed = 1; bp->bp_proposing = 0; } break; case BSTP_PDU_INFERIORALT: /* * only point to point links are allowed fast * transitions to forwarding. */ if (cu->cu_agree && bp->bp_ptp_link) { bp->bp_agreed = 1; bp->bp_proposing = 0; } else bp->bp_agreed = 0; if (cu->cu_topology_change) bp->bp_rcvdtc = 1; if (cu->cu_topology_change_ack) bp->bp_rcvdtca = 1; break; case BSTP_PDU_OTHER: return; /* do nothing */ } /* update the state machines with the new data */ bstp_update_state(bs, bp); } static int bstp_pdu_rcvtype(struct bstp_port *bp, struct bstp_config_unit *cu) { int type; /* default return type */ type = BSTP_PDU_OTHER; switch (cu->cu_role) { case BSTP_ROLE_DESIGNATED: if (bstp_info_superior(&bp->bp_port_pv, &cu->cu_pv)) /* bpdu priority is superior */ type = BSTP_PDU_SUPERIOR; else if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) == INFO_SAME) { if (bp->bp_port_msg_age != cu->cu_message_age || bp->bp_port_max_age != cu->cu_max_age || bp->bp_port_fdelay != cu->cu_forward_delay || bp->bp_port_htime != cu->cu_hello_time) /* bpdu priority is equal and timers differ */ type = BSTP_PDU_SUPERIOR; else /* bpdu is equal */ type = BSTP_PDU_REPEATED; } else /* bpdu priority is worse */ type = BSTP_PDU_INFERIOR; break; case BSTP_ROLE_ROOT: case BSTP_ROLE_ALTERNATE: case BSTP_ROLE_BACKUP: if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) <= INFO_SAME) /* * not a designated port and priority is the same or * worse */ type = BSTP_PDU_INFERIORALT; break; } return (type); } static int bstp_pdu_bettersame(struct bstp_port *bp, int newinfo) { if (newinfo == BSTP_INFO_RECEIVED && bp->bp_infois == BSTP_INFO_RECEIVED && bstp_info_cmp(&bp->bp_port_pv, &bp->bp_msg_cu.cu_pv) >= INFO_SAME) return (1); if (newinfo == BSTP_INFO_MINE && bp->bp_infois == BSTP_INFO_MINE && bstp_info_cmp(&bp->bp_port_pv, &bp->bp_desg_pv) >= INFO_SAME) return (1); return (0); } static int bstp_info_cmp(struct bstp_pri_vector *pv, struct bstp_pri_vector *cpv) { if (cpv->pv_root_id < pv->pv_root_id) return (INFO_BETTER); if (cpv->pv_root_id > pv->pv_root_id) return (INFO_WORSE); if (cpv->pv_cost < pv->pv_cost) return (INFO_BETTER); if (cpv->pv_cost > pv->pv_cost) return (INFO_WORSE); if (cpv->pv_dbridge_id < pv->pv_dbridge_id) return (INFO_BETTER); if (cpv->pv_dbridge_id > pv->pv_dbridge_id) return (INFO_WORSE); if (cpv->pv_dport_id < pv->pv_dport_id) return (INFO_BETTER); if (cpv->pv_dport_id > pv->pv_dport_id) return (INFO_WORSE); return (INFO_SAME); } /* * This message priority vector is superior to the port priority vector and * will replace it if, and only if, the message priority vector is better than * the port priority vector, or the message has been transmitted from the same * designated bridge and designated port as the port priority vector. */ static int bstp_info_superior(struct bstp_pri_vector *pv, struct bstp_pri_vector *cpv) { if (bstp_info_cmp(pv, cpv) == INFO_BETTER || (bstp_same_bridgeid(pv->pv_dbridge_id, cpv->pv_dbridge_id) && (cpv->pv_dport_id & 0xfff) == (pv->pv_dport_id & 0xfff))) return (1); return (0); } static void bstp_assign_roles(struct bstp_state *bs) { struct bstp_port *bp, *rbp = NULL; struct bstp_pri_vector pv; /* default to our priority vector */ bs->bs_root_pv = bs->bs_bridge_pv; bs->bs_root_msg_age = 0; bs->bs_root_max_age = bs->bs_bridge_max_age; bs->bs_root_fdelay = bs->bs_bridge_fdelay; bs->bs_root_htime = bs->bs_bridge_htime; bs->bs_root_port = NULL; /* check if any received info supersedes us */ LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { if (bp->bp_infois != BSTP_INFO_RECEIVED) continue; pv = bp->bp_port_pv; pv.pv_cost += bp->bp_path_cost; /* * The root priority vector is the best of the set comprising * the bridge priority vector plus all root path priority * vectors whose bridge address is not equal to us. */ if (bstp_same_bridgeid(pv.pv_dbridge_id, bs->bs_bridge_pv.pv_dbridge_id) == 0 && bstp_info_cmp(&bs->bs_root_pv, &pv) == INFO_BETTER) { /* the port vector replaces the root */ bs->bs_root_pv = pv; bs->bs_root_msg_age = bp->bp_port_msg_age + BSTP_MESSAGE_AGE_INCR; bs->bs_root_max_age = bp->bp_port_max_age; bs->bs_root_fdelay = bp->bp_port_fdelay; bs->bs_root_htime = bp->bp_port_htime; rbp = bp; } } LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { /* calculate the port designated vector */ bp->bp_desg_pv.pv_root_id = bs->bs_root_pv.pv_root_id; bp->bp_desg_pv.pv_cost = bs->bs_root_pv.pv_cost; bp->bp_desg_pv.pv_dbridge_id = bs->bs_bridge_pv.pv_dbridge_id; bp->bp_desg_pv.pv_dport_id = bp->bp_port_id; bp->bp_desg_pv.pv_port_id = bp->bp_port_id; /* calculate designated times */ bp->bp_desg_msg_age = bs->bs_root_msg_age; bp->bp_desg_max_age = bs->bs_root_max_age; bp->bp_desg_fdelay = bs->bs_root_fdelay; bp->bp_desg_htime = bs->bs_bridge_htime; switch (bp->bp_infois) { case BSTP_INFO_DISABLED: bstp_set_port_role(bp, BSTP_ROLE_DISABLED); break; case BSTP_INFO_AGED: bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); bstp_update_info(bp); break; case BSTP_INFO_MINE: bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); /* update the port info if stale */ if (bstp_info_cmp(&bp->bp_port_pv, &bp->bp_desg_pv) != INFO_SAME || (rbp != NULL && (bp->bp_port_msg_age != rbp->bp_port_msg_age || bp->bp_port_max_age != rbp->bp_port_max_age || bp->bp_port_fdelay != rbp->bp_port_fdelay || bp->bp_port_htime != rbp->bp_port_htime))) bstp_update_info(bp); break; case BSTP_INFO_RECEIVED: if (bp == rbp) { /* * root priority is derived from this * port, make it the root port. */ bstp_set_port_role(bp, BSTP_ROLE_ROOT); bs->bs_root_port = bp; } else if (bstp_info_cmp(&bp->bp_port_pv, &bp->bp_desg_pv) == INFO_BETTER) { /* * the port priority is lower than the root * port. */ bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); bstp_update_info(bp); } else { if (bstp_same_bridgeid( bp->bp_port_pv.pv_dbridge_id, bs->bs_bridge_pv.pv_dbridge_id)) { /* * the designated bridge refers to * another port on this bridge. */ bstp_set_port_role(bp, BSTP_ROLE_BACKUP); } else { /* * the port is an inferior path to the * root bridge. */ bstp_set_port_role(bp, BSTP_ROLE_ALTERNATE); } } break; } } } static void bstp_update_state(struct bstp_state *bs, struct bstp_port *bp) { struct bstp_port *bp2; int synced; BSTP_LOCK_ASSERT(bs); /* check if all the ports have syncronised again */ if (!bs->bs_allsynced) { synced = 1; LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { if (!(bp2->bp_synced || bp2->bp_role == BSTP_ROLE_ROOT)) { synced = 0; break; } } bs->bs_allsynced = synced; } bstp_update_roles(bs, bp); bstp_update_tc(bp); } static void bstp_update_roles(struct bstp_state *bs, struct bstp_port *bp) { switch (bp->bp_role) { case BSTP_ROLE_DISABLED: /* Clear any flags if set */ if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { bp->bp_sync = 0; bp->bp_synced = 1; bp->bp_reroot = 0; } break; case BSTP_ROLE_ALTERNATE: case BSTP_ROLE_BACKUP: if ((bs->bs_allsynced && !bp->bp_agree) || (bp->bp_proposed && bp->bp_agree)) { bp->bp_proposed = 0; bp->bp_agree = 1; bp->bp_flags |= BSTP_PORT_NEWINFO; DPRINTF("%s -> ALTERNATE_AGREED\n", bp->bp_ifp->if_xname); } if (bp->bp_proposed && !bp->bp_agree) { bstp_set_all_sync(bs); bp->bp_proposed = 0; DPRINTF("%s -> ALTERNATE_PROPOSED\n", bp->bp_ifp->if_xname); } /* Clear any flags if set */ if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { bp->bp_sync = 0; bp->bp_synced = 1; bp->bp_reroot = 0; DPRINTF("%s -> ALTERNATE_PORT\n", bp->bp_ifp->if_xname); } break; case BSTP_ROLE_ROOT: if (bp->bp_state != BSTP_IFSTATE_FORWARDING && !bp->bp_reroot) { bstp_set_all_reroot(bs); DPRINTF("%s -> ROOT_REROOT\n", bp->bp_ifp->if_xname); } if ((bs->bs_allsynced && !bp->bp_agree) || (bp->bp_proposed && bp->bp_agree)) { bp->bp_proposed = 0; bp->bp_sync = 0; bp->bp_agree = 1; bp->bp_flags |= BSTP_PORT_NEWINFO; DPRINTF("%s -> ROOT_AGREED\n", bp->bp_ifp->if_xname); } if (bp->bp_proposed && !bp->bp_agree) { bstp_set_all_sync(bs); bp->bp_proposed = 0; DPRINTF("%s -> ROOT_PROPOSED\n", bp->bp_ifp->if_xname); } if (bp->bp_state != BSTP_IFSTATE_FORWARDING && (bp->bp_forward_delay_timer.active == 0 || (bstp_rerooted(bs, bp) && bp->bp_recent_backup_timer.active == 0 && bp->bp_protover == BSTP_PROTO_RSTP))) { switch (bp->bp_state) { case BSTP_IFSTATE_DISCARDING: bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); break; case BSTP_IFSTATE_LEARNING: bstp_set_port_state(bp, BSTP_IFSTATE_FORWARDING); break; } } if (bp->bp_state == BSTP_IFSTATE_FORWARDING && bp->bp_reroot) { bp->bp_reroot = 0; DPRINTF("%s -> ROOT_REROOTED\n", bp->bp_ifp->if_xname); } break; case BSTP_ROLE_DESIGNATED: if (bp->bp_recent_root_timer.active == 0 && bp->bp_reroot) { bp->bp_reroot = 0; DPRINTF("%s -> DESIGNATED_RETIRED\n", bp->bp_ifp->if_xname); } if ((bp->bp_state == BSTP_IFSTATE_DISCARDING && !bp->bp_synced) || (bp->bp_agreed && !bp->bp_synced) || (bp->bp_operedge && !bp->bp_synced) || (bp->bp_sync && bp->bp_synced)) { bstp_timer_stop(&bp->bp_recent_root_timer); bp->bp_synced = 1; bp->bp_sync = 0; DPRINTF("%s -> DESIGNATED_SYNCED\n", bp->bp_ifp->if_xname); } if (bp->bp_state != BSTP_IFSTATE_FORWARDING && !bp->bp_agreed && !bp->bp_proposing && !bp->bp_operedge) { bp->bp_proposing = 1; bp->bp_flags |= BSTP_PORT_NEWINFO; bstp_timer_start(&bp->bp_edge_delay_timer, (bp->bp_ptp_link ? BSTP_DEFAULT_MIGRATE_DELAY : bp->bp_desg_max_age)); DPRINTF("%s -> DESIGNATED_PROPOSE\n", bp->bp_ifp->if_xname); } if (bp->bp_state != BSTP_IFSTATE_FORWARDING && (bp->bp_forward_delay_timer.active == 0 || bp->bp_agreed || bp->bp_operedge) && (bp->bp_recent_root_timer.active == 0 || !bp->bp_reroot) && !bp->bp_sync) { if (bp->bp_agreed) DPRINTF("%s -> AGREED\n", bp->bp_ifp->if_xname); /* * If agreed|operedge then go straight to forwarding, * otherwise follow discard -> learn -> forward. */ if (bp->bp_agreed || bp->bp_operedge || bp->bp_state == BSTP_IFSTATE_LEARNING) { bstp_set_port_state(bp, BSTP_IFSTATE_FORWARDING); bp->bp_agreed = bp->bp_protover; } else if (bp->bp_state == BSTP_IFSTATE_DISCARDING) bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); } if (((bp->bp_sync && !bp->bp_synced) || (bp->bp_reroot && bp->bp_recent_root_timer.active) || (bp->bp_flags & BSTP_PORT_DISPUTED)) && !bp->bp_operedge && bp->bp_state != BSTP_IFSTATE_DISCARDING) { bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); bp->bp_flags &= ~BSTP_PORT_DISPUTED; bstp_timer_start(&bp->bp_forward_delay_timer, bp->bp_protover == BSTP_PROTO_RSTP ? bp->bp_desg_htime : bp->bp_desg_fdelay); DPRINTF("%s -> DESIGNATED_DISCARD\n", bp->bp_ifp->if_xname); } break; } if (bp->bp_flags & BSTP_PORT_NEWINFO) bstp_transmit(bs, bp); } static void bstp_update_tc(struct bstp_port *bp) { switch (bp->bp_tcstate) { case BSTP_TCSTATE_ACTIVE: if ((bp->bp_role != BSTP_ROLE_DESIGNATED && bp->bp_role != BSTP_ROLE_ROOT) || bp->bp_operedge) bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); if (bp->bp_rcvdtcn) bstp_set_port_tc(bp, BSTP_TCSTATE_TCN); if (bp->bp_rcvdtc) bstp_set_port_tc(bp, BSTP_TCSTATE_TC); if (bp->bp_tc_prop && !bp->bp_operedge) bstp_set_port_tc(bp, BSTP_TCSTATE_PROPAG); if (bp->bp_rcvdtca) bstp_set_port_tc(bp, BSTP_TCSTATE_ACK); break; case BSTP_TCSTATE_INACTIVE: if ((bp->bp_state == BSTP_IFSTATE_LEARNING || bp->bp_state == BSTP_IFSTATE_FORWARDING) && bp->bp_fdbflush == 0) bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); break; case BSTP_TCSTATE_LEARNING: if (bp->bp_rcvdtc || bp->bp_rcvdtcn || bp->bp_rcvdtca || bp->bp_tc_prop) bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); else if (bp->bp_role != BSTP_ROLE_DESIGNATED && bp->bp_role != BSTP_ROLE_ROOT && bp->bp_state == BSTP_IFSTATE_DISCARDING) bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); if ((bp->bp_role == BSTP_ROLE_DESIGNATED || bp->bp_role == BSTP_ROLE_ROOT) && bp->bp_state == BSTP_IFSTATE_FORWARDING && !bp->bp_operedge) bstp_set_port_tc(bp, BSTP_TCSTATE_DETECTED); break; /* these are transient states and go straight back to ACTIVE */ case BSTP_TCSTATE_DETECTED: case BSTP_TCSTATE_TCN: case BSTP_TCSTATE_TC: case BSTP_TCSTATE_PROPAG: case BSTP_TCSTATE_ACK: DPRINTF("Invalid TC state for %s\n", bp->bp_ifp->if_xname); break; } } static void bstp_update_info(struct bstp_port *bp) { struct bstp_state *bs = bp->bp_bs; bp->bp_proposing = 0; bp->bp_proposed = 0; if (bp->bp_agreed && !bstp_pdu_bettersame(bp, BSTP_INFO_MINE)) bp->bp_agreed = 0; if (bp->bp_synced && !bp->bp_agreed) { bp->bp_synced = 0; bs->bs_allsynced = 0; } /* copy the designated pv to the port */ bp->bp_port_pv = bp->bp_desg_pv; bp->bp_port_msg_age = bp->bp_desg_msg_age; bp->bp_port_max_age = bp->bp_desg_max_age; bp->bp_port_fdelay = bp->bp_desg_fdelay; bp->bp_port_htime = bp->bp_desg_htime; bp->bp_infois = BSTP_INFO_MINE; /* Set transmit flag but do not immediately send */ bp->bp_flags |= BSTP_PORT_NEWINFO; } /* set tcprop on every port other than the caller */ static void bstp_set_other_tcprop(struct bstp_port *bp) { struct bstp_state *bs = bp->bp_bs; struct bstp_port *bp2; BSTP_LOCK_ASSERT(bs); LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { if (bp2 == bp) continue; bp2->bp_tc_prop = 1; } } static void bstp_set_all_reroot(struct bstp_state *bs) { struct bstp_port *bp; BSTP_LOCK_ASSERT(bs); LIST_FOREACH(bp, &bs->bs_bplist, bp_next) bp->bp_reroot = 1; } static void bstp_set_all_sync(struct bstp_state *bs) { struct bstp_port *bp; BSTP_LOCK_ASSERT(bs); LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { bp->bp_sync = 1; bp->bp_synced = 0; /* Not explicit in spec */ } bs->bs_allsynced = 0; } static void bstp_set_port_state(struct bstp_port *bp, int state) { if (bp->bp_state == state) return; bp->bp_state = state; switch (bp->bp_state) { case BSTP_IFSTATE_DISCARDING: DPRINTF("state changed to DISCARDING on %s\n", bp->bp_ifp->if_xname); break; case BSTP_IFSTATE_LEARNING: DPRINTF("state changed to LEARNING on %s\n", bp->bp_ifp->if_xname); bstp_timer_start(&bp->bp_forward_delay_timer, bp->bp_protover == BSTP_PROTO_RSTP ? bp->bp_desg_htime : bp->bp_desg_fdelay); break; case BSTP_IFSTATE_FORWARDING: DPRINTF("state changed to FORWARDING on %s\n", bp->bp_ifp->if_xname); bstp_timer_stop(&bp->bp_forward_delay_timer); /* Record that we enabled forwarding */ bp->bp_forward_transitions++; break; } /* notify the parent bridge */ taskqueue_enqueue(taskqueue_swi, &bp->bp_statetask); } static void bstp_set_port_role(struct bstp_port *bp, int role) { struct bstp_state *bs = bp->bp_bs; if (bp->bp_role == role) return; /* perform pre-change tasks */ switch (bp->bp_role) { case BSTP_ROLE_DISABLED: bstp_timer_start(&bp->bp_forward_delay_timer, bp->bp_desg_max_age); break; case BSTP_ROLE_BACKUP: bstp_timer_start(&bp->bp_recent_backup_timer, bp->bp_desg_htime * 2); /* fall through */ case BSTP_ROLE_ALTERNATE: bstp_timer_start(&bp->bp_forward_delay_timer, bp->bp_desg_fdelay); bp->bp_sync = 0; bp->bp_synced = 1; bp->bp_reroot = 0; break; case BSTP_ROLE_ROOT: bstp_timer_start(&bp->bp_recent_root_timer, BSTP_DEFAULT_FORWARD_DELAY); break; } bp->bp_role = role; /* clear values not carried between roles */ bp->bp_proposing = 0; bs->bs_allsynced = 0; /* initialise the new role */ switch (bp->bp_role) { case BSTP_ROLE_DISABLED: case BSTP_ROLE_ALTERNATE: case BSTP_ROLE_BACKUP: DPRINTF("%s role -> ALT/BACK/DISABLED\n", bp->bp_ifp->if_xname); bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); bstp_timer_stop(&bp->bp_recent_root_timer); bstp_timer_latch(&bp->bp_forward_delay_timer); bp->bp_sync = 0; bp->bp_synced = 1; bp->bp_reroot = 0; break; case BSTP_ROLE_ROOT: DPRINTF("%s role -> ROOT\n", bp->bp_ifp->if_xname); bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); bstp_timer_latch(&bp->bp_recent_root_timer); bp->bp_proposing = 0; break; case BSTP_ROLE_DESIGNATED: DPRINTF("%s role -> DESIGNATED\n", bp->bp_ifp->if_xname); bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); bp->bp_agree = 0; break; } /* let the TC state know that the role changed */ bstp_update_tc(bp); } static void bstp_set_port_proto(struct bstp_port *bp, int proto) { struct bstp_state *bs = bp->bp_bs; /* supported protocol versions */ switch (proto) { case BSTP_PROTO_STP: /* we can downgrade protocols only */ bstp_timer_stop(&bp->bp_migrate_delay_timer); /* clear unsupported features */ bp->bp_operedge = 0; /* STP compat mode only uses 16 bits of the 32 */ if (bp->bp_path_cost > 65535) bp->bp_path_cost = 65535; break; case BSTP_PROTO_RSTP: bstp_timer_start(&bp->bp_migrate_delay_timer, bs->bs_migration_delay); break; default: DPRINTF("Unsupported STP version %d\n", proto); return; } bp->bp_protover = proto; bp->bp_flags &= ~BSTP_PORT_CANMIGRATE; } static void bstp_set_port_tc(struct bstp_port *bp, int state) { struct bstp_state *bs = bp->bp_bs; bp->bp_tcstate = state; /* initialise the new state */ switch (bp->bp_tcstate) { case BSTP_TCSTATE_ACTIVE: DPRINTF("%s -> TC_ACTIVE\n", bp->bp_ifp->if_xname); /* nothing to do */ break; case BSTP_TCSTATE_INACTIVE: bstp_timer_stop(&bp->bp_tc_timer); /* flush routes on the parent bridge */ bp->bp_fdbflush = 1; taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask); bp->bp_tc_ack = 0; DPRINTF("%s -> TC_INACTIVE\n", bp->bp_ifp->if_xname); break; case BSTP_TCSTATE_LEARNING: bp->bp_rcvdtc = 0; bp->bp_rcvdtcn = 0; bp->bp_rcvdtca = 0; bp->bp_tc_prop = 0; DPRINTF("%s -> TC_LEARNING\n", bp->bp_ifp->if_xname); break; case BSTP_TCSTATE_DETECTED: bstp_set_timer_tc(bp); bstp_set_other_tcprop(bp); /* send out notification */ bp->bp_flags |= BSTP_PORT_NEWINFO; bstp_transmit(bs, bp); getmicrotime(&bs->bs_last_tc_time); DPRINTF("%s -> TC_DETECTED\n", bp->bp_ifp->if_xname); bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ break; case BSTP_TCSTATE_TCN: bstp_set_timer_tc(bp); DPRINTF("%s -> TC_TCN\n", bp->bp_ifp->if_xname); /* fall through */ case BSTP_TCSTATE_TC: bp->bp_rcvdtc = 0; bp->bp_rcvdtcn = 0; if (bp->bp_role == BSTP_ROLE_DESIGNATED) bp->bp_tc_ack = 1; bstp_set_other_tcprop(bp); DPRINTF("%s -> TC_TC\n", bp->bp_ifp->if_xname); bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ break; case BSTP_TCSTATE_PROPAG: /* flush routes on the parent bridge */ bp->bp_fdbflush = 1; taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask); bp->bp_tc_prop = 0; bstp_set_timer_tc(bp); DPRINTF("%s -> TC_PROPAG\n", bp->bp_ifp->if_xname); bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ break; case BSTP_TCSTATE_ACK: bstp_timer_stop(&bp->bp_tc_timer); bp->bp_rcvdtca = 0; DPRINTF("%s -> TC_ACK\n", bp->bp_ifp->if_xname); bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ break; } } static void bstp_set_timer_tc(struct bstp_port *bp) { struct bstp_state *bs = bp->bp_bs; if (bp->bp_tc_timer.active) return; switch (bp->bp_protover) { case BSTP_PROTO_RSTP: bstp_timer_start(&bp->bp_tc_timer, bp->bp_desg_htime + BSTP_TICK_VAL); bp->bp_flags |= BSTP_PORT_NEWINFO; break; case BSTP_PROTO_STP: bstp_timer_start(&bp->bp_tc_timer, bs->bs_root_max_age + bs->bs_root_fdelay); break; } } static void bstp_set_timer_msgage(struct bstp_port *bp) { if (bp->bp_port_msg_age + BSTP_MESSAGE_AGE_INCR <= bp->bp_port_max_age) { bstp_timer_start(&bp->bp_message_age_timer, bp->bp_port_htime * 3); } else /* expires immediately */ bstp_timer_start(&bp->bp_message_age_timer, 0); } static int bstp_rerooted(struct bstp_state *bs, struct bstp_port *bp) { struct bstp_port *bp2; int rr_set = 0; LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { if (bp2 == bp) continue; if (bp2->bp_recent_root_timer.active) { rr_set = 1; break; } } return (!rr_set); } int bstp_set_htime(struct bstp_state *bs, int t) { /* convert seconds to ticks */ t *= BSTP_TICK_VAL; /* value can only be changed in leagacy stp mode */ if (bs->bs_protover != BSTP_PROTO_STP) return (EPERM); if (t < BSTP_MIN_HELLO_TIME || t > BSTP_MAX_HELLO_TIME) return (EINVAL); BSTP_LOCK(bs); bs->bs_bridge_htime = t; bstp_reinit(bs); BSTP_UNLOCK(bs); return (0); } int bstp_set_fdelay(struct bstp_state *bs, int t) { /* convert seconds to ticks */ t *= BSTP_TICK_VAL; if (t < BSTP_MIN_FORWARD_DELAY || t > BSTP_MAX_FORWARD_DELAY) return (EINVAL); BSTP_LOCK(bs); bs->bs_bridge_fdelay = t; bstp_reinit(bs); BSTP_UNLOCK(bs); return (0); } int bstp_set_maxage(struct bstp_state *bs, int t) { /* convert seconds to ticks */ t *= BSTP_TICK_VAL; if (t < BSTP_MIN_MAX_AGE || t > BSTP_MAX_MAX_AGE) return (EINVAL); BSTP_LOCK(bs); bs->bs_bridge_max_age = t; bstp_reinit(bs); BSTP_UNLOCK(bs); return (0); } int bstp_set_holdcount(struct bstp_state *bs, int count) { struct bstp_port *bp; if (count < BSTP_MIN_HOLD_COUNT || count > BSTP_MAX_HOLD_COUNT) return (EINVAL); BSTP_LOCK(bs); bs->bs_txholdcount = count; LIST_FOREACH(bp, &bs->bs_bplist, bp_next) bp->bp_txcount = 0; BSTP_UNLOCK(bs); return (0); } int bstp_set_protocol(struct bstp_state *bs, int proto) { struct bstp_port *bp; switch (proto) { /* Supported protocol versions */ case BSTP_PROTO_STP: case BSTP_PROTO_RSTP: break; default: return (EINVAL); } BSTP_LOCK(bs); bs->bs_protover = proto; bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { /* reinit state */ bp->bp_infois = BSTP_INFO_DISABLED; bp->bp_txcount = 0; bstp_set_port_proto(bp, bs->bs_protover); bstp_set_port_role(bp, BSTP_ROLE_DISABLED); bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); bstp_timer_stop(&bp->bp_recent_backup_timer); } bstp_reinit(bs); BSTP_UNLOCK(bs); return (0); } int bstp_set_priority(struct bstp_state *bs, int pri) { if (pri < 0 || pri > BSTP_MAX_PRIORITY) return (EINVAL); /* Limit to steps of 4096 */ pri -= pri % 4096; BSTP_LOCK(bs); bs->bs_bridge_priority = pri; bstp_reinit(bs); BSTP_UNLOCK(bs); return (0); } int bstp_set_port_priority(struct bstp_port *bp, int pri) { struct bstp_state *bs = bp->bp_bs; if (pri < 0 || pri > BSTP_MAX_PORT_PRIORITY) return (EINVAL); /* Limit to steps of 16 */ pri -= pri % 16; BSTP_LOCK(bs); bp->bp_priority = pri; bstp_reinit(bs); BSTP_UNLOCK(bs); return (0); } int bstp_set_path_cost(struct bstp_port *bp, uint32_t path_cost) { struct bstp_state *bs = bp->bp_bs; if (path_cost > BSTP_MAX_PATH_COST) return (EINVAL); /* STP compat mode only uses 16 bits of the 32 */ if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) path_cost = 65535; BSTP_LOCK(bs); if (path_cost == 0) { /* use auto */ bp->bp_flags &= ~BSTP_PORT_ADMCOST; bp->bp_path_cost = bstp_calc_path_cost(bp); } else { bp->bp_path_cost = path_cost; bp->bp_flags |= BSTP_PORT_ADMCOST; } bstp_reinit(bs); BSTP_UNLOCK(bs); return (0); } int bstp_set_edge(struct bstp_port *bp, int set) { struct bstp_state *bs = bp->bp_bs; BSTP_LOCK(bs); if ((bp->bp_operedge = set) == 0) bp->bp_flags &= ~BSTP_PORT_ADMEDGE; else bp->bp_flags |= BSTP_PORT_ADMEDGE; BSTP_UNLOCK(bs); return (0); } int bstp_set_autoedge(struct bstp_port *bp, int set) { struct bstp_state *bs = bp->bp_bs; BSTP_LOCK(bs); if (set) { bp->bp_flags |= BSTP_PORT_AUTOEDGE; /* we may be able to transition straight to edge */ if (bp->bp_edge_delay_timer.active == 0) bstp_edge_delay_expiry(bs, bp); } else bp->bp_flags &= ~BSTP_PORT_AUTOEDGE; BSTP_UNLOCK(bs); return (0); } int bstp_set_ptp(struct bstp_port *bp, int set) { struct bstp_state *bs = bp->bp_bs; BSTP_LOCK(bs); bp->bp_ptp_link = set; BSTP_UNLOCK(bs); return (0); } int bstp_set_autoptp(struct bstp_port *bp, int set) { struct bstp_state *bs = bp->bp_bs; BSTP_LOCK(bs); if (set) { bp->bp_flags |= BSTP_PORT_AUTOPTP; if (bp->bp_role != BSTP_ROLE_DISABLED) taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask); } else bp->bp_flags &= ~BSTP_PORT_AUTOPTP; BSTP_UNLOCK(bs); return (0); } /* * Calculate the path cost according to the link speed. */ static uint32_t bstp_calc_path_cost(struct bstp_port *bp) { struct ifnet *ifp = bp->bp_ifp; uint32_t path_cost; /* If the priority has been manually set then retain the value */ if (bp->bp_flags & BSTP_PORT_ADMCOST) return bp->bp_path_cost; if (ifp->if_link_state == LINK_STATE_DOWN) { /* Recalc when the link comes up again */ bp->bp_flags |= BSTP_PORT_PNDCOST; return (BSTP_DEFAULT_PATH_COST); } if (ifp->if_baudrate < 1000) return (BSTP_DEFAULT_PATH_COST); /* formula from section 17.14, IEEE Std 802.1D-2004 */ path_cost = 20000000000ULL / (ifp->if_baudrate / 1000); if (path_cost > BSTP_MAX_PATH_COST) path_cost = BSTP_MAX_PATH_COST; /* STP compat mode only uses 16 bits of the 32 */ if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) path_cost = 65535; return (path_cost); } /* * Notify the bridge that a port state has changed, we need to do this from a * taskqueue to avoid a LOR. */ static void bstp_notify_state(void *arg, int pending) { struct bstp_port *bp = (struct bstp_port *)arg; struct bstp_state *bs = bp->bp_bs; if (bp->bp_active == 1 && bs->bs_state_cb != NULL) (*bs->bs_state_cb)(bp->bp_ifp, bp->bp_state); } /* * Flush the routes on the bridge port, we need to do this from a * taskqueue to avoid a LOR. */ static void bstp_notify_rtage(void *arg, int pending) { struct bstp_port *bp = (struct bstp_port *)arg; struct bstp_state *bs = bp->bp_bs; int age = 0; BSTP_LOCK(bs); switch (bp->bp_protover) { case BSTP_PROTO_STP: /* convert to seconds */ age = bp->bp_desg_fdelay / BSTP_TICK_VAL; break; case BSTP_PROTO_RSTP: age = 0; break; } BSTP_UNLOCK(bs); if (bp->bp_active == 1 && bs->bs_rtage_cb != NULL) (*bs->bs_rtage_cb)(bp->bp_ifp, age); /* flush is complete */ BSTP_LOCK(bs); bp->bp_fdbflush = 0; BSTP_UNLOCK(bs); } void bstp_linkstate(struct bstp_port *bp) { struct bstp_state *bs = bp->bp_bs; if (!bp->bp_active) return; bstp_ifupdstatus(bp, 0); BSTP_LOCK(bs); bstp_update_state(bs, bp); BSTP_UNLOCK(bs); } static void bstp_ifupdstatus(void *arg, int pending) { struct bstp_port *bp = (struct bstp_port *)arg; struct bstp_state *bs = bp->bp_bs; struct ifnet *ifp = bp->bp_ifp; struct ifmediareq ifmr; int error, changed; if (!bp->bp_active) return; bzero((char *)&ifmr, sizeof(ifmr)); error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); BSTP_LOCK(bs); changed = 0; if ((error == 0) && (ifp->if_flags & IFF_UP)) { if (ifmr.ifm_status & IFM_ACTIVE) { /* A full-duplex link is assumed to be point to point */ if (bp->bp_flags & BSTP_PORT_AUTOPTP) { int fdx; fdx = ifmr.ifm_active & IFM_FDX ? 1 : 0; if (bp->bp_ptp_link ^ fdx) { bp->bp_ptp_link = fdx; changed = 1; } } /* Calc the cost if the link was down previously */ if (bp->bp_flags & BSTP_PORT_PNDCOST) { uint32_t cost; cost = bstp_calc_path_cost(bp); if (bp->bp_path_cost != cost) { bp->bp_path_cost = cost; changed = 1; } bp->bp_flags &= ~BSTP_PORT_PNDCOST; } if (bp->bp_role == BSTP_ROLE_DISABLED) { bstp_enable_port(bs, bp); changed = 1; } } else { if (bp->bp_role != BSTP_ROLE_DISABLED) { bstp_disable_port(bs, bp); changed = 1; if ((bp->bp_flags & BSTP_PORT_ADMEDGE) && bp->bp_protover == BSTP_PROTO_RSTP) bp->bp_operedge = 1; } } } else if (bp->bp_infois != BSTP_INFO_DISABLED) { bstp_disable_port(bs, bp); changed = 1; } if (changed) bstp_assign_roles(bs); BSTP_UNLOCK(bs); } static void bstp_enable_port(struct bstp_state *bs, struct bstp_port *bp) { bp->bp_infois = BSTP_INFO_AGED; } static void bstp_disable_port(struct bstp_state *bs, struct bstp_port *bp) { bp->bp_infois = BSTP_INFO_DISABLED; } static void bstp_tick(void *arg) { struct bstp_state *bs = arg; struct bstp_port *bp; BSTP_LOCK_ASSERT(bs); if (bs->bs_running == 0) return; CURVNET_SET(bs->bs_vnet); /* poll link events on interfaces that do not support linkstate */ if (bstp_timer_dectest(&bs->bs_link_timer)) { LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { if (!(bp->bp_ifp->if_capabilities & IFCAP_LINKSTATE)) taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask); } bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); } LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { /* no events need to happen for these */ bstp_timer_dectest(&bp->bp_tc_timer); bstp_timer_dectest(&bp->bp_recent_root_timer); bstp_timer_dectest(&bp->bp_forward_delay_timer); bstp_timer_dectest(&bp->bp_recent_backup_timer); if (bstp_timer_dectest(&bp->bp_hello_timer)) bstp_hello_timer_expiry(bs, bp); if (bstp_timer_dectest(&bp->bp_message_age_timer)) bstp_message_age_expiry(bs, bp); if (bstp_timer_dectest(&bp->bp_migrate_delay_timer)) bstp_migrate_delay_expiry(bs, bp); if (bstp_timer_dectest(&bp->bp_edge_delay_timer)) bstp_edge_delay_expiry(bs, bp); /* update the various state machines for the port */ bstp_update_state(bs, bp); if (bp->bp_txcount > 0) bp->bp_txcount--; } CURVNET_RESTORE(); callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs); } static void bstp_timer_start(struct bstp_timer *t, uint16_t v) { t->value = v; t->active = 1; t->latched = 0; } static void bstp_timer_stop(struct bstp_timer *t) { t->value = 0; t->active = 0; t->latched = 0; } static void bstp_timer_latch(struct bstp_timer *t) { t->latched = 1; t->active = 1; } static int bstp_timer_dectest(struct bstp_timer *t) { if (t->active == 0 || t->latched) return (0); t->value -= BSTP_TICK_VAL; if (t->value <= 0) { bstp_timer_stop(t); return (1); } return (0); } static void bstp_hello_timer_expiry(struct bstp_state *bs, struct bstp_port *bp) { if ((bp->bp_flags & BSTP_PORT_NEWINFO) || bp->bp_role == BSTP_ROLE_DESIGNATED || (bp->bp_role == BSTP_ROLE_ROOT && bp->bp_tc_timer.active == 1)) { bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); bp->bp_flags |= BSTP_PORT_NEWINFO; bstp_transmit(bs, bp); } } static void bstp_message_age_expiry(struct bstp_state *bs, struct bstp_port *bp) { if (bp->bp_infois == BSTP_INFO_RECEIVED) { bp->bp_infois = BSTP_INFO_AGED; bstp_assign_roles(bs); DPRINTF("aged info on %s\n", bp->bp_ifp->if_xname); } } static void bstp_migrate_delay_expiry(struct bstp_state *bs, struct bstp_port *bp) { bp->bp_flags |= BSTP_PORT_CANMIGRATE; } static void bstp_edge_delay_expiry(struct bstp_state *bs, struct bstp_port *bp) { if ((bp->bp_flags & BSTP_PORT_AUTOEDGE) && bp->bp_protover == BSTP_PROTO_RSTP && bp->bp_proposing && bp->bp_role == BSTP_ROLE_DESIGNATED) { bp->bp_operedge = 1; DPRINTF("%s -> edge port\n", bp->bp_ifp->if_xname); } } static int bstp_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { d = ((int)a[i]) - ((int)b[i]); } return (d); } /* * compare the bridge address component of the bridgeid */ static int bstp_same_bridgeid(uint64_t id1, uint64_t id2) { u_char addr1[ETHER_ADDR_LEN]; u_char addr2[ETHER_ADDR_LEN]; PV2ADDR(id1, addr1); PV2ADDR(id2, addr2); if (bstp_addr_cmp(addr1, addr2) == 0) return (1); return (0); } void bstp_reinit(struct bstp_state *bs) { struct bstp_port *bp; struct ifnet *ifp, *mif; u_char *e_addr; void *bridgeptr; static const u_char llzero[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ BSTP_LOCK_ASSERT(bs); if (LIST_EMPTY(&bs->bs_bplist)) goto disablestp; mif = NULL; bridgeptr = LIST_FIRST(&bs->bs_bplist)->bp_ifp->if_bridge; KASSERT(bridgeptr != NULL, ("Invalid bridge pointer")); /* * Search through the Ethernet adapters and find the one with the * lowest value. Make sure the adapter which we take the MAC address * from is part of this bridge, so we can have more than one independent * bridges in the same STP domain. */ IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type != IFT_ETHER) continue; /* Not Ethernet */ if (ifp->if_bridge != bridgeptr) continue; /* Not part of our bridge */ if (bstp_addr_cmp(IF_LLADDR(ifp), llzero) == 0) continue; /* No mac address set */ if (mif == NULL) { mif = ifp; continue; } if (bstp_addr_cmp(IF_LLADDR(ifp), IF_LLADDR(mif)) < 0) { mif = ifp; continue; } } IFNET_RUNLOCK_NOSLEEP(); if (mif == NULL) goto disablestp; e_addr = IF_LLADDR(mif); bs->bs_bridge_pv.pv_dbridge_id = (((uint64_t)bs->bs_bridge_priority) << 48) | (((uint64_t)e_addr[0]) << 40) | (((uint64_t)e_addr[1]) << 32) | (((uint64_t)e_addr[2]) << 24) | (((uint64_t)e_addr[3]) << 16) | (((uint64_t)e_addr[4]) << 8) | (((uint64_t)e_addr[5])); bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; bs->bs_bridge_pv.pv_cost = 0; bs->bs_bridge_pv.pv_dport_id = 0; bs->bs_bridge_pv.pv_port_id = 0; if (bs->bs_running && callout_pending(&bs->bs_bstpcallout) == 0) callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs); LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { bp->bp_port_id = (bp->bp_priority << 8) | (bp->bp_ifp->if_index & 0xfff); taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask); } bstp_assign_roles(bs); bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); return; disablestp: /* Set the bridge and root id (lower bits) to zero */ bs->bs_bridge_pv.pv_dbridge_id = ((uint64_t)bs->bs_bridge_priority) << 48; bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; bs->bs_root_pv = bs->bs_bridge_pv; /* Disable any remaining ports, they will have no MAC address */ LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { bp->bp_infois = BSTP_INFO_DISABLED; bstp_set_port_role(bp, BSTP_ROLE_DISABLED); } callout_stop(&bs->bs_bstpcallout); } static int bstp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: mtx_init(&bstp_list_mtx, "bridgestp list", NULL, MTX_DEF); LIST_INIT(&bstp_list); break; case MOD_UNLOAD: mtx_destroy(&bstp_list_mtx); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t bstp_mod = { "bridgestp", bstp_modevent, 0 }; DECLARE_MODULE(bridgestp, bstp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(bridgestp, 1); void bstp_attach(struct bstp_state *bs, struct bstp_cb_ops *cb) { BSTP_LOCK_INIT(bs); callout_init_mtx(&bs->bs_bstpcallout, &bs->bs_mtx, 0); LIST_INIT(&bs->bs_bplist); bs->bs_bridge_max_age = BSTP_DEFAULT_MAX_AGE; bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; bs->bs_bridge_fdelay = BSTP_DEFAULT_FORWARD_DELAY; bs->bs_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY; bs->bs_hold_time = BSTP_DEFAULT_HOLD_TIME; bs->bs_migration_delay = BSTP_DEFAULT_MIGRATE_DELAY; bs->bs_txholdcount = BSTP_DEFAULT_HOLD_COUNT; bs->bs_protover = BSTP_PROTO_RSTP; bs->bs_state_cb = cb->bcb_state; bs->bs_rtage_cb = cb->bcb_rtage; bs->bs_vnet = curvnet; getmicrotime(&bs->bs_last_tc_time); mtx_lock(&bstp_list_mtx); LIST_INSERT_HEAD(&bstp_list, bs, bs_list); mtx_unlock(&bstp_list_mtx); } void bstp_detach(struct bstp_state *bs) { KASSERT(LIST_EMPTY(&bs->bs_bplist), ("bstp still active")); mtx_lock(&bstp_list_mtx); LIST_REMOVE(bs, bs_list); mtx_unlock(&bstp_list_mtx); callout_drain(&bs->bs_bstpcallout); BSTP_LOCK_DESTROY(bs); } void bstp_init(struct bstp_state *bs) { BSTP_LOCK(bs); callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs); bs->bs_running = 1; bstp_reinit(bs); BSTP_UNLOCK(bs); } void bstp_stop(struct bstp_state *bs) { struct bstp_port *bp; BSTP_LOCK(bs); LIST_FOREACH(bp, &bs->bs_bplist, bp_next) bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); bs->bs_running = 0; callout_stop(&bs->bs_bstpcallout); BSTP_UNLOCK(bs); } int bstp_create(struct bstp_state *bs, struct bstp_port *bp, struct ifnet *ifp) { bzero(bp, sizeof(struct bstp_port)); BSTP_LOCK(bs); bp->bp_ifp = ifp; bp->bp_bs = bs; bp->bp_priority = BSTP_DEFAULT_PORT_PRIORITY; TASK_INIT(&bp->bp_statetask, 0, bstp_notify_state, bp); TASK_INIT(&bp->bp_rtagetask, 0, bstp_notify_rtage, bp); TASK_INIT(&bp->bp_mediatask, 0, bstp_ifupdstatus, bp); /* Init state */ bp->bp_infois = BSTP_INFO_DISABLED; bp->bp_flags = BSTP_PORT_AUTOEDGE|BSTP_PORT_AUTOPTP; bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); bstp_set_port_proto(bp, bs->bs_protover); bstp_set_port_role(bp, BSTP_ROLE_DISABLED); bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); bp->bp_path_cost = bstp_calc_path_cost(bp); BSTP_UNLOCK(bs); return (0); } int bstp_enable(struct bstp_port *bp) { struct bstp_state *bs = bp->bp_bs; struct ifnet *ifp = bp->bp_ifp; KASSERT(bp->bp_active == 0, ("already a bstp member")); switch (ifp->if_type) { case IFT_ETHER: /* These can do spanning tree. */ break; default: /* Nothing else can. */ return (EINVAL); } BSTP_LOCK(bs); LIST_INSERT_HEAD(&bs->bs_bplist, bp, bp_next); bp->bp_active = 1; bp->bp_flags |= BSTP_PORT_NEWINFO; bstp_reinit(bs); bstp_update_roles(bs, bp); BSTP_UNLOCK(bs); return (0); } void bstp_disable(struct bstp_port *bp) { struct bstp_state *bs = bp->bp_bs; KASSERT(bp->bp_active == 1, ("not a bstp member")); BSTP_LOCK(bs); bstp_disable_port(bs, bp); LIST_REMOVE(bp, bp_next); bp->bp_active = 0; bstp_reinit(bs); BSTP_UNLOCK(bs); } /* * The bstp_port structure is about to be freed by the parent bridge. */ void bstp_destroy(struct bstp_port *bp) { KASSERT(bp->bp_active == 0, ("port is still attached")); taskqueue_drain(taskqueue_swi, &bp->bp_statetask); taskqueue_drain(taskqueue_swi, &bp->bp_rtagetask); taskqueue_drain(taskqueue_swi, &bp->bp_mediatask); } Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 334117) +++ head/sys/net/if.c (revision 334118) @@ -1,4539 +1,4529 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; __read_mostly epoch_t net_epoch; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ case _IOC_NEWTYPE((cmd), struct ifgroupreq32): #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ case (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); -static void if_grow(void); +static void *if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); #ifdef VIMAGE static void if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); - IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short -ifindex_alloc(void) +ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); -retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { - if_grow(); - goto retry; + *old = if_grow(); + return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void -ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) +ifnet_setbyindex(u_short idx, struct ifnet *ifp) { - IFNET_WLOCK_ASSERT(); - V_ifindex_table[idx] = ifp; } -static void -ifnet_setbyindex(u_short idx, struct ifnet *ifp) -{ - - IFNET_WLOCK(); - ifnet_setbyindex_locked(idx, ifp); - IFNET_WUNLOCK(); -} - struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { + void *old; - TAILQ_INIT(&V_ifnet); - TAILQ_INIT(&V_ifg_head); + CK_STAILQ_INIT(&V_ifnet); + CK_STAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); - if_grow(); /* create initial table */ + old = if_grow(); /* create initial table */ IFNET_WUNLOCK(); + epoch_wait_preempt(net_epoch_preempt); + free(old, M_IFNET); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { - VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " + VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); - VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " + VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. */ - TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { + CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif -static void + +static void * if_grow(void) { int oldlim; u_int n; struct ifnet **e; + void *old; + old = NULL; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); - return; + return (NULL); } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); - free((caddr_t)V_ifindex_table, M_IFNET); + old = V_ifindex_table; } V_if_indexlim <<= 1; V_ifindex_table = e; + return (old); } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; + void *old; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); + restart: IFNET_WLOCK(); - idx = ifindex_alloc(); - ifnet_setbyindex_locked(idx, IFNET_HOLD); + idx = ifindex_alloc(&old); + if (__predict_false(idx == USHRT_MAX)) { + IFNET_WUNLOCK(); + epoch_wait_preempt(net_epoch_preempt); + free(old, M_IFNET); + goto restart; + } + ifnet_setbyindex(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); - TAILQ_INIT(&ifp->if_groups); + CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp, M_IFNET); } +static void +if_destroy(epoch_context_t ctx) +{ + struct ifnet *ifp; + + ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); + if_free_internal(ifp); +} + /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) - if_free_internal(ifp); + epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; - if_free_internal(ifp); + epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); - TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); + CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc(EPOCH_PREEMPT); net_epoch = epoch_alloc(0); } SYSINIT(ifepochalloc, SI_SUB_TASKQ + 1, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; - TAILQ_FOREACH(ifp, &V_ifnet, if_link) + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; /* XXX cannot hold IF_ADDR_WLOCK over called functions. */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE int shutdown; shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; #endif IFNET_WLOCK(); - TAILQ_FOREACH(iter, &V_ifnet, if_link) + CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { - TAILQ_REMOVE(&V_ifnet, ifp, if_link); + CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif - + epoch_wait_preempt(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ free(ifp->if_hw_addr, M_IFADDR); ifp->if_hw_addr = NULL; ifp->if_addr = NULL; /* We can now free link ifaddr. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; u_int bif_dlt, bif_hdrlen; + void *old; int rc; /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); - + restart: IFNET_WLOCK(); - ifp->if_index = ifindex_alloc(); - ifnet_setbyindex_locked(ifp->if_index, ifp); + ifp->if_index = ifindex_alloc(&old); + if (__predict_false(ifp->if_index == USHRT_MAX)) { + IFNET_WUNLOCK(); + epoch_wait_preempt(net_epoch_preempt); + free(old, M_IFNET); + goto restart; + } + ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); - TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) + CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } - TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) + CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; - TAILQ_INIT(&ifg->ifg_members); - TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); + CK_STAILQ_INIT(&ifg->ifg_members); + CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); - TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); - TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); + CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); + CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; + int freeifgl; IFNET_WLOCK(); - TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) + CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } + freeifgl = 0; IF_ADDR_WLOCK(ifp); - TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); + CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); - TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) + CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; - if (ifgm != NULL) { - TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); - free(ifgm, M_TEMP); - } + if (ifgm != NULL) + CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); if (--ifgl->ifgl_group->ifg_refcnt == 0) { - TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); - IFNET_WUNLOCK(); + CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); + freeifgl = 1; + } + IFNET_WUNLOCK(); + + epoch_wait_preempt(net_epoch_preempt); + if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); - } else - IFNET_WUNLOCK(); - + } + free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; + int ifglfree; IFNET_WLOCK(); - while (!TAILQ_EMPTY(&ifp->if_groups)) { - ifgl = TAILQ_FIRST(&ifp->if_groups); + while (!CK_STAILQ_EMPTY(&ifp->if_groups)) { + ifgl = CK_STAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); - TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); + CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); - TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) + CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; - if (ifgm != NULL) { - TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, + if (ifgm != NULL) + CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); - free(ifgm, M_TEMP); + ifglfree = 0; + if (--ifgl->ifgl_group->ifg_refcnt == 0) { + CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); + ifglfree = 1; } - if (--ifgl->ifgl_group->ifg_refcnt == 0) { - TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); - IFNET_WUNLOCK(); + epoch_wait_preempt(net_epoch_preempt); + free(ifgm, M_TEMP); + if (ifglfree) { EVENTHANDLER_INVOKE(group_detach_event, - ifgl->ifgl_group); + ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); - } else - IFNET_WUNLOCK(); - - free(ifgl, M_TEMP); - + } EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. */ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) + CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { + CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); - TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) + CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { - TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) + CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); - TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { + CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) epoch_call(net_epoch_preempt, &ifa->ifa_epoch_ctx, ifa_destroy); } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0 && !(cmd == RTM_ADD && error == EEXIST) && !(cmd == RTM_DELETE && error == ENOENT)) if_printf(ifp, "%s failed: %d\n", otype, error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ -static struct ifaddr * -ifa_ifwithaddr_internal(const struct sockaddr *addr, int getref) +struct ifaddr * +ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; - IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - IF_ADDR_RLOCK(ifp); + MPASS(in_epoch()); + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { - if (getref) - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { - if (getref) - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); goto done; } } - IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: - IFNET_RUNLOCK_NOSLEEP(); return (ifa); } -struct ifaddr * -ifa_ifwithaddr(const struct sockaddr *addr) -{ - - return (ifa_ifwithaddr_internal(addr, 1)); -} - int ifa_ifwithaddr_check(const struct sockaddr *addr) { + int rc; - return (ifa_ifwithaddr_internal(addr, 0) != NULL); + NET_EPOCH_ENTER(); + rc = (ifa_ifwithaddr(addr) != NULL); + NET_EPOCH_EXIT(); + return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; - IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + MPASS(in_epoch()); + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); goto done; } } - IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: - IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; - IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + MPASS(in_epoch()); + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); goto done; } } - IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: - IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; + MPASS(in_epoch()); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. */ - IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { - ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { - if (ifa_maybe != NULL) - ifa_free(ifa_maybe); ifa_maybe = ifa; - ifa_ref(ifa_maybe); } } } - IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: - IFNET_RUNLOCK_NOSLEEP(); - if (ifa_maybe != NULL) - ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); - IF_ADDR_RLOCK(ifp); + MPASS(in_epoch()); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: - if (ifa != NULL) - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == NULL) || ((ifp = ifa->ifa_ifp) == NULL) || ((dst = rt_key(rt)) == NULL)) return; + NET_EPOCH_ENTER(); ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; - ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } + NET_EPOCH_EXIT(); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. */ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET NETDUMP_REINIT(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) return (error); break; CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data = NULL; struct ifmediareq ifmr; #endif struct ifmediareq *ifmrp; struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE int shutdown; #endif CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET && so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifmrp = NULL; #ifdef COMPAT_FREEBSD32 switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. */ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); epoch_call(net_epoch_preempt, &ifma->ifma_epoch_ctx, if_destroymulti); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(oifp, &V_ifnet, if_link) + CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(oifp, &V_ifnet, if_link) + CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0) CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. * * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; + int rc; - IF_ADDR_RLOCK(ifp); + rc = 0; + NET_EPOCH_ENTER(); ifa = ifp->if_addr; if (ifa == NULL) { - IF_ADDR_RUNLOCK(ifp); - return (EINVAL); + rc = EINVAL; + goto out; } - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); + sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { - ifa_free(ifa); - return (EINVAL); + rc = EINVAL; + goto out; } if (len != sdl->sdl_alen) { /* don't allow length to change */ - ifa_free(ifa); - return (EINVAL); + rc = EINVAL; + goto out; } switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); - ifa_free(ifa); break; default: - ifa_free(ifa); - return (ENODEV); + rc = ENODEV; + goto out; } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); - return (0); + out: + NET_EPOCH_EXIT(); + return (rc); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { char if_fmt[256]; va_list ap; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); va_start(ap, fmt); vlog(LOG_INFO, if_fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) { struct ifmultiaddr *ifma; int cnt = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) cnt += filter(arg, ifma, cnt); if_maddr_runlock(ifp); return (cnt); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: head/sys/net/if_llatbl.c =================================================================== --- head/sys/net/if_llatbl.c (revision 334117) +++ head/sys/net/if_llatbl.c (revision 334118) @@ -1,965 +1,965 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved. * Copyright (c) 2004-2008 Qing Li. All rights reserved. * Copyright (c) 2008 Kip Macy. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_LLTABLE, "lltable", "link level address tables"); static VNET_DEFINE(SLIST_HEAD(, lltable), lltables) = SLIST_HEAD_INITIALIZER(lltables); #define V_lltables VNET(lltables) static struct rwlock lltable_list_lock; RW_SYSINIT(lltable_list_lock, &lltable_list_lock, "lltable_list_lock"); #define LLTABLE_LIST_RLOCK() rw_rlock(&lltable_list_lock) #define LLTABLE_LIST_RUNLOCK() rw_runlock(&lltable_list_lock) #define LLTABLE_LIST_WLOCK() rw_wlock(&lltable_list_lock) #define LLTABLE_LIST_WUNLOCK() rw_wunlock(&lltable_list_lock) #define LLTABLE_LIST_LOCK_ASSERT() rw_assert(&lltable_list_lock, RA_LOCKED) static void lltable_unlink(struct lltable *llt); static void llentries_unlink(struct lltable *llt, struct llentries *head); static void htable_unlink_entry(struct llentry *lle); static void htable_link_entry(struct lltable *llt, struct llentry *lle); static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg); /* * Dump lle state for a specific address family. */ static int lltable_dump_af(struct lltable *llt, struct sysctl_req *wr) { int error; LLTABLE_LIST_LOCK_ASSERT(); if (llt->llt_ifp->if_flags & IFF_LOOPBACK) return (0); error = 0; IF_AFDATA_RLOCK(llt->llt_ifp); error = lltable_foreach_lle(llt, (llt_foreach_cb_t *)llt->llt_dump_entry, wr); IF_AFDATA_RUNLOCK(llt->llt_ifp); return (error); } /* * Dump arp state for a specific address family. */ int lltable_sysctl_dumparp(int af, struct sysctl_req *wr) { struct lltable *llt; int error = 0; LLTABLE_LIST_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af == af) { error = lltable_dump_af(llt, wr); if (error != 0) goto done; } } done: LLTABLE_LIST_RUNLOCK(); return (error); } /* * Common function helpers for chained hash table. */ /* * Runs specified callback for each entry in @llt. * Caller does the locking. * */ static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { struct llentry *lle, *next; int i, error; error = 0; for (i = 0; i < llt->llt_hsize; i++) { - LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { + CK_LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { error = f(llt, lle, farg); if (error != 0) break; } } return (error); } static void htable_link_entry(struct lltable *llt, struct llentry *lle) { struct llentries *lleh; uint32_t hashidx; if ((lle->la_flags & LLE_LINKED) != 0) return; IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); hashidx = llt->llt_hash(lle, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; - LIST_INSERT_HEAD(lleh, lle, lle_next); + CK_LIST_INSERT_HEAD(lleh, lle, lle_next); } static void htable_unlink_entry(struct llentry *lle) { if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); - LIST_REMOVE(lle, lle_next); + CK_LIST_REMOVE(lle, lle_next); lle->la_flags &= ~(LLE_VALID | LLE_LINKED); #if 0 lle->lle_tbl = NULL; lle->lle_head = NULL; #endif } } struct prefix_match_data { const struct sockaddr *addr; const struct sockaddr *mask; struct llentries dchain; u_int flags; }; static int htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct prefix_match_data *pmd; pmd = (struct prefix_match_data *)farg; if (llt->llt_match_prefix(pmd->addr, pmd->mask, pmd->flags, lle)) { LLE_WLOCK(lle); LIST_INSERT_HEAD(&pmd->dchain, lle, lle_chain); } return (0); } static void htable_prefix_free(struct lltable *llt, const struct sockaddr *addr, const struct sockaddr *mask, u_int flags) { struct llentry *lle, *next; struct prefix_match_data pmd; bzero(&pmd, sizeof(pmd)); pmd.addr = addr; pmd.mask = mask; pmd.flags = flags; - LIST_INIT(&pmd.dchain); + CK_LIST_INIT(&pmd.dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push matching lles to chain */ lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd); llentries_unlink(llt, &pmd.dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &pmd.dchain, lle_chain, next) lltable_free_entry(llt, lle); } static void htable_free_tbl(struct lltable *llt) { free(llt->lle_head, M_LLTABLE); free(llt, M_LLTABLE); } static void llentries_unlink(struct lltable *llt, struct llentries *head) { struct llentry *lle, *next; LIST_FOREACH_SAFE(lle, head, lle_chain, next) llt->llt_unlink_entry(lle); } /* * Helper function used to drop all mbufs in hold queue. * * Returns the number of held packets, if any, that were dropped. */ size_t lltable_drop_entry_queue(struct llentry *lle) { size_t pkts_dropped; struct mbuf *next; LLE_WLOCK_ASSERT(lle); pkts_dropped = 0; while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) { next = lle->la_hold->m_nextpkt; m_freem(lle->la_hold); lle->la_hold = next; lle->la_numheld--; pkts_dropped++; } KASSERT(lle->la_numheld == 0, ("%s: la_numheld %d > 0, pkts_droped %zd", __func__, lle->la_numheld, pkts_dropped)); return (pkts_dropped); } void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle, const char *linkhdr, size_t linkhdrsize, int lladdr_off) { memcpy(lle->r_linkdata, linkhdr, linkhdrsize); lle->r_hdrlen = linkhdrsize; lle->ll_addr = &lle->r_linkdata[lladdr_off]; lle->la_flags |= LLE_VALID; lle->r_flags |= RLLE_VALID; } /* * Tries to update @lle link-level address. * Since update requires AFDATA WLOCK, function * drops @lle lock, acquires AFDATA lock and then acquires * @lle lock to maintain lock order. * * Returns 1 on success. */ int lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle, const char *linkhdr, size_t linkhdrsize, int lladdr_off) { /* Perform real LLE update */ /* use afdata WLOCK to update fields */ LLE_WLOCK_ASSERT(lle); LLE_ADDREF(lle); LLE_WUNLOCK(lle); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* * Since we droppped LLE lock, other thread might have deleted * this lle. Check and return */ if ((lle->la_flags & LLE_DELETED) != 0) { IF_AFDATA_WUNLOCK(ifp); LLE_FREE_LOCKED(lle); return (0); } /* Update data */ lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); IF_AFDATA_WUNLOCK(ifp); LLE_REMREF(lle); return (1); } /* * Helper function used to pre-compute full/partial link-layer * header data suitable for feeding into if_output(). */ int lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr, char *buf, size_t *bufsize, int *lladdr_off) { struct if_encap_req ereq; int error; bzero(buf, *bufsize); bzero(&ereq, sizeof(ereq)); ereq.buf = buf; ereq.bufsize = *bufsize; ereq.rtype = IFENCAP_LL; ereq.family = family; ereq.lladdr = lladdr; ereq.lladdr_len = ifp->if_addrlen; error = ifp->if_requestencap(ifp, &ereq); if (error == 0) { *bufsize = ereq.bufsize; *lladdr_off = ereq.lladdr_off; } return (error); } /* * Update link-layer header for given @lle after * interface lladdr was changed. */ static int llentry_update_ifaddr(struct lltable *llt, struct llentry *lle, void *farg) { struct ifnet *ifp; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; u_char *lladdr; int lladdr_off; ifp = (struct ifnet *)farg; lladdr = lle->ll_addr; LLE_WLOCK(lle); if ((lle->la_flags & LLE_VALID) == 0) { LLE_WUNLOCK(lle); return (0); } if ((lle->la_flags & LLE_IFADDR) != 0) lladdr = IF_LLADDR(ifp); linkhdrsize = sizeof(linkhdr); lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr, &linkhdrsize, &lladdr_off); memcpy(lle->r_linkdata, linkhdr, linkhdrsize); LLE_WUNLOCK(lle); return (0); } /* * Update all calculated headers for given @llt */ void lltable_update_ifaddr(struct lltable *llt) { if (llt->llt_ifp->if_flags & IFF_LOOPBACK) return; IF_AFDATA_WLOCK(llt->llt_ifp); lltable_foreach_lle(llt, llentry_update_ifaddr, llt->llt_ifp); IF_AFDATA_WUNLOCK(llt->llt_ifp); } /* * * Performs generic cleanup routines and frees lle. * * Called for non-linked entries, with callouts and * other AF-specific cleanups performed. * * @lle must be passed WLOCK'ed * * Returns the number of held packets, if any, that were dropped. */ size_t llentry_free(struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT((lle->la_flags & LLE_LINKED) == 0, ("freeing linked lle")); pkts_dropped = lltable_drop_entry_queue(lle); LLE_FREE_LOCKED(lle); return (pkts_dropped); } /* * (al)locate an llentry for address dst (equivalent to rtalloc for new-arp). * * If found the llentry * is returned referenced and unlocked. */ struct llentry * llentry_alloc(struct ifnet *ifp, struct lltable *lt, struct sockaddr_storage *dst) { struct llentry *la, *la_tmp; IF_AFDATA_RLOCK(ifp); la = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); IF_AFDATA_RUNLOCK(ifp); if (la != NULL) { LLE_ADDREF(la); LLE_WUNLOCK(la); return (la); } if ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { la = lltable_alloc_entry(lt, 0, (struct sockaddr *)dst); if (la == NULL) return (NULL); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); /* Prefer any existing LLE over newly-created one */ la_tmp = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); if (la_tmp == NULL) lltable_link_entry(lt, la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp != NULL) { lltable_free_entry(lt, la); la = la_tmp; } LLE_ADDREF(la); LLE_WUNLOCK(la); } return (la); } /* * Free all entries from given table and free itself. */ static int lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct llentries *dchain; dchain = (struct llentries *)farg; LLE_WLOCK(lle); LIST_INSERT_HEAD(dchain, lle, lle_chain); return (0); } /* * Free all entries from given table and free itself. */ void lltable_free(struct lltable *llt) { struct llentry *lle, *next; struct llentries dchain; KASSERT(llt != NULL, ("%s: llt is NULL", __func__)); lltable_unlink(llt); - LIST_INIT(&dchain); + CK_LIST_INIT(&dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push all lles to @dchain */ lltable_foreach_lle(llt, lltable_free_cb, &dchain); llentries_unlink(llt, &dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) { if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); llentry_free(lle); } llt->llt_free_tbl(llt); } #if 0 void lltable_drain(int af) { struct lltable *llt; struct llentry *lle; int i; LLTABLE_LIST_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af != af) continue; for (i=0; i < llt->llt_hsize; i++) { - LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { + CK_LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { LLE_WLOCK(lle); if (lle->la_hold) { m_freem(lle->la_hold); lle->la_hold = NULL; } LLE_WUNLOCK(lle); } } } LLTABLE_LIST_RUNLOCK(); } #endif /* * Deletes an address from given lltable. * Used for userland interaction to remove * individual entries. Skips entries added by OS. */ int lltable_delete_addr(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { struct llentry *lle; struct ifnet *ifp; ifp = llt->llt_ifp; IF_AFDATA_WLOCK(ifp); lle = lla_lookup(llt, LLE_EXCLUSIVE, l3addr); if (lle == NULL) { IF_AFDATA_WUNLOCK(ifp); return (ENOENT); } if ((lle->la_flags & LLE_IFADDR) != 0 && (flags & LLE_IFADDR) == 0) { IF_AFDATA_WUNLOCK(ifp); LLE_WUNLOCK(lle); return (EPERM); } lltable_unlink_entry(llt, lle); IF_AFDATA_WUNLOCK(ifp); llt->llt_delete_entry(llt, lle); return (0); } void lltable_prefix_free(int af, struct sockaddr *addr, struct sockaddr *mask, u_int flags) { struct lltable *llt; LLTABLE_LIST_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af != af) continue; llt->llt_prefix_free(llt, addr, mask, flags); } LLTABLE_LIST_RUNLOCK(); } struct lltable * lltable_allocate_htbl(uint32_t hsize) { struct lltable *llt; int i; llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK | M_ZERO); llt->llt_hsize = hsize; llt->lle_head = malloc(sizeof(struct llentries) * hsize, M_LLTABLE, M_WAITOK | M_ZERO); for (i = 0; i < llt->llt_hsize; i++) - LIST_INIT(&llt->lle_head[i]); + CK_LIST_INIT(&llt->lle_head[i]); /* Set some default callbacks */ llt->llt_link_entry = htable_link_entry; llt->llt_unlink_entry = htable_unlink_entry; llt->llt_prefix_free = htable_prefix_free; llt->llt_foreach_entry = htable_foreach_lle; llt->llt_free_tbl = htable_free_tbl; return (llt); } /* * Links lltable to global llt list. */ void lltable_link(struct lltable *llt) { LLTABLE_LIST_WLOCK(); SLIST_INSERT_HEAD(&V_lltables, llt, llt_link); LLTABLE_LIST_WUNLOCK(); } static void lltable_unlink(struct lltable *llt) { LLTABLE_LIST_WLOCK(); SLIST_REMOVE(&V_lltables, llt, lltable, llt_link); LLTABLE_LIST_WUNLOCK(); } /* * External methods used by lltable consumers */ int lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { return (llt->llt_foreach_entry(llt, f, farg)); } struct llentry * lltable_alloc_entry(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { return (llt->llt_alloc_entry(llt, flags, l3addr)); } void lltable_free_entry(struct lltable *llt, struct llentry *lle) { llt->llt_free_entry(llt, lle); } void lltable_link_entry(struct lltable *llt, struct llentry *lle) { llt->llt_link_entry(llt, lle); } void lltable_unlink_entry(struct lltable *llt, struct llentry *lle) { llt->llt_unlink_entry(lle); } void lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct lltable *llt; llt = lle->lle_tbl; llt->llt_fill_sa_entry(lle, sa); } struct ifnet * lltable_get_ifp(const struct lltable *llt) { return (llt->llt_ifp); } int lltable_get_af(const struct lltable *llt) { return (llt->llt_af); } /* * Called in route_output when rtm_flags contains RTF_LLDATA. */ int lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info) { struct sockaddr_dl *dl = (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY]; struct sockaddr *dst = (struct sockaddr *)info->rti_info[RTAX_DST]; struct ifnet *ifp; struct lltable *llt; struct llentry *lle, *lle_tmp; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; u_int laflags = 0; int error; KASSERT(dl != NULL && dl->sdl_family == AF_LINK, ("%s: invalid dl\n", __func__)); ifp = ifnet_byindex(dl->sdl_index); if (ifp == NULL) { log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n", __func__, dl->sdl_index); return EINVAL; } /* XXX linked list may be too expensive */ LLTABLE_LIST_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af == dst->sa_family && llt->llt_ifp == ifp) break; } LLTABLE_LIST_RUNLOCK(); KASSERT(llt != NULL, ("Yep, ugly hacks are bad\n")); error = 0; switch (rtm->rtm_type) { case RTM_ADD: /* Add static LLE */ laflags = 0; if (rtm->rtm_rmx.rmx_expire == 0) laflags = LLE_STATIC; lle = lltable_alloc_entry(llt, laflags, dst); if (lle == NULL) return (ENOMEM); linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, dst->sa_family, LLADDR(dl), linkhdr, &linkhdrsize, &lladdr_off) != 0) return (EINVAL); lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); if ((rtm->rtm_flags & RTF_ANNOUNCE)) lle->la_flags |= LLE_PUB; lle->la_expire = rtm->rtm_rmx.rmx_expire; laflags = lle->la_flags; /* Try to link new entry */ lle_tmp = NULL; IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, dst); if (lle_tmp != NULL) { /* Check if we are trying to replace immutable entry */ if ((lle_tmp->la_flags & LLE_IFADDR) != 0) { IF_AFDATA_WUNLOCK(ifp); LLE_WUNLOCK(lle_tmp); lltable_free_entry(llt, lle); return (EPERM); } /* Unlink existing entry from table */ lltable_unlink_entry(llt, lle_tmp); } lltable_link_entry(llt, lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { EVENTHANDLER_INVOKE(lle_event, lle_tmp,LLENTRY_EXPIRED); lltable_free_entry(llt, lle_tmp); } /* * By invoking LLE handler here we might get * two events on static LLE entry insertion * in routing socket. However, since we might have * other subscribers we need to generate this event. */ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); #ifdef INET /* gratuitous ARP */ if ((laflags & LLE_PUB) && dst->sa_family == AF_INET) arprequest(ifp, &((struct sockaddr_in *)dst)->sin_addr, &((struct sockaddr_in *)dst)->sin_addr, (u_char *)LLADDR(dl)); #endif break; case RTM_DELETE: return (lltable_delete_addr(llt, 0, dst)); default: error = EINVAL; } return (error); } #ifdef DDB struct llentry_sa { struct llentry base; struct sockaddr l3_addr; }; static void llatbl_lle_show(struct llentry_sa *la) { struct llentry *lle; uint8_t octet[6]; lle = &la->base; db_printf("lle=%p\n", lle); db_printf(" lle_next=%p\n", lle->lle_next.le_next); db_printf(" lle_lock=%p\n", &lle->lle_lock); db_printf(" lle_tbl=%p\n", lle->lle_tbl); db_printf(" lle_head=%p\n", lle->lle_head); db_printf(" la_hold=%p\n", lle->la_hold); db_printf(" la_numheld=%d\n", lle->la_numheld); db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire); db_printf(" la_flags=0x%04x\n", lle->la_flags); db_printf(" la_asked=%u\n", lle->la_asked); db_printf(" la_preempt=%u\n", lle->la_preempt); db_printf(" ln_state=%d\n", lle->ln_state); db_printf(" ln_router=%u\n", lle->ln_router); db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick); db_printf(" lle_refcnt=%d\n", lle->lle_refcnt); bcopy(lle->ll_addr, octet, sizeof(octet)); db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n", octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]); db_printf(" lle_timer=%p\n", &lle->lle_timer); switch (la->l3_addr.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; char l3s[INET_ADDRSTRLEN]; sin = (struct sockaddr_in *)&la->l3_addr; inet_ntoa_r(sin->sin_addr, l3s); db_printf(" l3_addr=%s\n", l3s); break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; char l3s[INET6_ADDRSTRLEN]; sin6 = (struct sockaddr_in6 *)&la->l3_addr; ip6_sprintf(l3s, &sin6->sin6_addr); db_printf(" l3_addr=%s\n", l3s); break; } #endif default: db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family); break; } } DB_SHOW_COMMAND(llentry, db_show_llentry) { if (!have_addr) { db_printf("usage: show llentry \n"); return; } llatbl_lle_show((struct llentry_sa *)addr); } static void llatbl_llt_show(struct lltable *llt) { int i; struct llentry *lle; db_printf("llt=%p llt_af=%d llt_ifp=%p\n", llt, llt->llt_af, llt->llt_ifp); for (i = 0; i < llt->llt_hsize; i++) { - LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { + CK_LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { llatbl_lle_show((struct llentry_sa *)lle); if (db_pager_quit) return; } } } DB_SHOW_COMMAND(lltable, db_show_lltable) { if (!have_addr) { db_printf("usage: show lltable \n"); return; } llatbl_llt_show((struct lltable *)addr); } DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables) { VNET_ITERATOR_DECL(vnet_iter); struct lltable *llt; VNET_FOREACH(vnet_iter) { CURVNET_SET_QUIET(vnet_iter); #ifdef VIMAGE db_printf("vnet=%p\n", curvnet); #endif SLIST_FOREACH(llt, &V_lltables, llt_link) { db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n", llt, llt->llt_af, llt->llt_ifp, (llt->llt_ifp != NULL) ? llt->llt_ifp->if_xname : "?"); if (have_addr && addr != 0) /* verbose */ llatbl_llt_show(llt); if (db_pager_quit) { CURVNET_RESTORE(); return; } } CURVNET_RESTORE(); } } #endif Index: head/sys/net/if_llatbl.h =================================================================== --- head/sys/net/if_llatbl.h (revision 334117) +++ head/sys/net/if_llatbl.h (revision 334118) @@ -1,276 +1,278 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved. * Copyright (c) 2004-2008 Qing Li. All rights reserved. * Copyright (c) 2008 Kip Macy. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NET_IF_LLATBL_H_ #define _NET_IF_LLATBL_H_ #include #include +#include struct ifnet; struct sysctl_req; struct rt_msghdr; struct rt_addrinfo; struct llentry; LIST_HEAD(llentries, llentry); #define LLE_MAX_LINKHDR 24 /* Full IB header */ /* * Code referencing llentry must at least hold * a shared lock */ struct llentry { LIST_ENTRY(llentry) lle_next; union { struct in_addr addr4; struct in6_addr addr6; } r_l3addr; char r_linkdata[LLE_MAX_LINKHDR]; /* L2 data */ uint8_t r_hdrlen; /* length for LL header */ uint8_t spare0[3]; uint16_t r_flags; /* LLE runtime flags */ uint16_t r_skip_req; /* feedback from fast path */ struct lltable *lle_tbl; struct llentries *lle_head; void (*lle_free)(struct llentry *); struct mbuf *la_hold; int la_numheld; /* # of packets currently held */ time_t la_expire; uint16_t la_flags; uint16_t la_asked; uint16_t la_preempt; int16_t ln_state; /* IPv6 has ND6_LLINFO_NOSTATE == -2 */ uint16_t ln_router; time_t ln_ntick; time_t lle_remtime; /* Real time remaining */ time_t lle_hittime; /* Time when r_skip_req was unset */ int lle_refcnt; char *ll_addr; /* link-layer address */ LIST_ENTRY(llentry) lle_chain; /* chain of deleted items */ struct callout lle_timer; struct rwlock lle_lock; struct mtx req_mtx; + struct epoch_context lle_epoch_ctx; }; #define LLE_WLOCK(lle) rw_wlock(&(lle)->lle_lock) #define LLE_RLOCK(lle) rw_rlock(&(lle)->lle_lock) #define LLE_WUNLOCK(lle) rw_wunlock(&(lle)->lle_lock) #define LLE_RUNLOCK(lle) rw_runlock(&(lle)->lle_lock) #define LLE_DOWNGRADE(lle) rw_downgrade(&(lle)->lle_lock) #define LLE_TRY_UPGRADE(lle) rw_try_upgrade(&(lle)->lle_lock) #define LLE_LOCK_INIT(lle) rw_init_flags(&(lle)->lle_lock, "lle", RW_DUPOK) #define LLE_LOCK_DESTROY(lle) rw_destroy(&(lle)->lle_lock) #define LLE_WLOCK_ASSERT(lle) rw_assert(&(lle)->lle_lock, RA_WLOCKED) #define LLE_REQ_INIT(lle) mtx_init(&(lle)->req_mtx, "lle req", \ NULL, MTX_DEF) #define LLE_REQ_DESTROY(lle) mtx_destroy(&(lle)->req_mtx) #define LLE_REQ_LOCK(lle) mtx_lock(&(lle)->req_mtx) #define LLE_REQ_UNLOCK(lle) mtx_unlock(&(lle)->req_mtx) #define LLE_IS_VALID(lle) (((lle) != NULL) && ((lle) != (void *)-1)) #define LLE_ADDREF(lle) do { \ LLE_WLOCK_ASSERT(lle); \ KASSERT((lle)->lle_refcnt >= 0, \ ("negative refcnt %d on lle %p", \ (lle)->lle_refcnt, (lle))); \ (lle)->lle_refcnt++; \ } while (0) #define LLE_REMREF(lle) do { \ LLE_WLOCK_ASSERT(lle); \ KASSERT((lle)->lle_refcnt > 0, \ ("bogus refcnt %d on lle %p", \ (lle)->lle_refcnt, (lle))); \ (lle)->lle_refcnt--; \ } while (0) #define LLE_FREE_LOCKED(lle) do { \ if ((lle)->lle_refcnt == 1) \ (lle)->lle_free(lle); \ else { \ LLE_REMREF(lle); \ LLE_WUNLOCK(lle); \ } \ /* guard against invalid refs */ \ (lle) = NULL; \ } while (0) #define LLE_FREE(lle) do { \ LLE_WLOCK(lle); \ LLE_FREE_LOCKED(lle); \ } while (0) typedef struct llentry *(llt_lookup_t)(struct lltable *, u_int flags, const struct sockaddr *l3addr); typedef struct llentry *(llt_alloc_t)(struct lltable *, u_int flags, const struct sockaddr *l3addr); typedef void (llt_delete_t)(struct lltable *, struct llentry *); typedef void (llt_prefix_free_t)(struct lltable *, const struct sockaddr *addr, const struct sockaddr *mask, u_int flags); typedef int (llt_dump_entry_t)(struct lltable *, struct llentry *, struct sysctl_req *); typedef uint32_t (llt_hash_t)(const struct llentry *, uint32_t); typedef int (llt_match_prefix_t)(const struct sockaddr *, const struct sockaddr *, u_int, struct llentry *); typedef void (llt_free_entry_t)(struct lltable *, struct llentry *); typedef void (llt_fill_sa_entry_t)(const struct llentry *, struct sockaddr *); typedef void (llt_free_tbl_t)(struct lltable *); typedef void (llt_link_entry_t)(struct lltable *, struct llentry *); typedef void (llt_unlink_entry_t)(struct llentry *); typedef void (llt_mark_used_t)(struct llentry *); typedef int (llt_foreach_cb_t)(struct lltable *, struct llentry *, void *); typedef int (llt_foreach_entry_t)(struct lltable *, llt_foreach_cb_t *, void *); struct lltable { SLIST_ENTRY(lltable) llt_link; int llt_af; int llt_hsize; struct llentries *lle_head; struct ifnet *llt_ifp; llt_lookup_t *llt_lookup; llt_alloc_t *llt_alloc_entry; llt_delete_t *llt_delete_entry; llt_prefix_free_t *llt_prefix_free; llt_dump_entry_t *llt_dump_entry; llt_hash_t *llt_hash; llt_match_prefix_t *llt_match_prefix; llt_free_entry_t *llt_free_entry; llt_foreach_entry_t *llt_foreach_entry; llt_link_entry_t *llt_link_entry; llt_unlink_entry_t *llt_unlink_entry; llt_fill_sa_entry_t *llt_fill_sa_entry; llt_free_tbl_t *llt_free_tbl; llt_mark_used_t *llt_mark_used; }; MALLOC_DECLARE(M_LLTABLE); /* * LLentry flags */ #define LLE_DELETED 0x0001 /* entry must be deleted */ #define LLE_STATIC 0x0002 /* entry is static */ #define LLE_IFADDR 0x0004 /* entry is interface addr */ #define LLE_VALID 0x0008 /* ll_addr is valid */ #define LLE_REDIRECT 0x0010 /* installed by redirect; has host rtentry */ #define LLE_PUB 0x0020 /* publish entry ??? */ #define LLE_LINKED 0x0040 /* linked to lookup structure */ /* LLE request flags */ #define LLE_EXCLUSIVE 0x2000 /* return lle xlocked */ #define LLE_UNLOCKED 0x4000 /* return lle unlocked */ #define LLE_ADDRONLY 0x4000 /* return lladdr instead of full header */ #define LLE_CREATE 0x8000 /* hint to avoid lle lookup */ /* LLE flags used by fastpath code */ #define RLLE_VALID 0x0001 /* entry is valid */ #define RLLE_IFADDR LLE_IFADDR /* entry is ifaddr */ #define LLATBL_HASH(key, mask) \ (((((((key >> 8) ^ key) >> 8) ^ key) >> 8) ^ key) & mask) struct lltable *lltable_allocate_htbl(uint32_t hsize); void lltable_free(struct lltable *); void lltable_link(struct lltable *llt); void lltable_prefix_free(int, struct sockaddr *, struct sockaddr *, u_int); #if 0 void lltable_drain(int); #endif int lltable_sysctl_dumparp(int, struct sysctl_req *); size_t llentry_free(struct llentry *); struct llentry *llentry_alloc(struct ifnet *, struct lltable *, struct sockaddr_storage *); /* helper functions */ size_t lltable_drop_entry_queue(struct llentry *); void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle, const char *linkhdr, size_t linkhdrsize, int lladdr_off); int lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle, const char *linkhdr, size_t linkhdrsize, int lladdr_off); int lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr, char *buf, size_t *bufsize, int *lladdr_off); void lltable_update_ifaddr(struct lltable *llt); struct llentry *lltable_alloc_entry(struct lltable *llt, u_int flags, const struct sockaddr *l4addr); void lltable_free_entry(struct lltable *llt, struct llentry *lle); int lltable_delete_addr(struct lltable *llt, u_int flags, const struct sockaddr *l3addr); void lltable_link_entry(struct lltable *llt, struct llentry *lle); void lltable_unlink_entry(struct lltable *llt, struct llentry *lle); void lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa); struct ifnet *lltable_get_ifp(const struct lltable *llt); int lltable_get_af(const struct lltable *llt); int lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg); /* * Generic link layer address lookup function. */ static __inline struct llentry * lla_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { return (llt->llt_lookup(llt, flags, l3addr)); } /* * Notify the LLE code that the entry was used by datapath. */ static __inline void llentry_mark_used(struct llentry *lle) { if (lle->r_skip_req == 0) return; if ((lle->r_flags & RLLE_VALID) != 0) lle->lle_tbl->llt_mark_used(lle); } int lla_rt_output(struct rt_msghdr *, struct rt_addrinfo *); #include enum { LLENTRY_RESOLVED, LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED, }; typedef void (*lle_event_fn)(void *, struct llentry *, int); EVENTHANDLER_DECLARE(lle_event, lle_event_fn); #endif /* _NET_IF_LLATBL_H_ */ Index: head/sys/net/if_var.h =================================================================== --- head/sys/net/if_var.h (revision 334117) +++ head/sys/net/if_var.h (revision 334118) @@ -1,763 +1,767 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. * * Output occurs when the routine if_output is called, with three parameters: * (*ifp->if_output)(ifp, m, dst, rt) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. * * On input, each interface unwraps the data received by it, and either * places it on the input queue of an internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating an interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ struct rtentry; /* ifa_rtrequest */ struct rt_addrinfo; /* ifa_rtrequest */ struct socket; struct carp_if; struct carp_softc; struct ifvlantrunk; struct route; /* if_output */ struct vnet; struct ifmedia; struct netmap_adapter; struct netdump_methods; #ifdef _KERNEL #include /* ifqueue only? */ #include #include #endif /* _KERNEL */ #include #include #include #include /* XXX */ #include /* struct ifqueue */ #include /* XXX */ #include /* XXX */ #include /* if_link_task */ #define IF_DUNIT_NONE -1 #include -TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ +CK_STAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ CK_STAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ CK_STAILQ_HEAD(ifmultihead, ifmultiaddr); -TAILQ_HEAD(ifgrouphead, ifg_group); +CK_STAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ #define V_link_pfil_hook VNET(link_pfil_hook) #define HHOOK_IPSEC_INET 0 #define HHOOK_IPSEC_INET6 1 #define HHOOK_IPSEC_COUNT 2 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); #define V_ipsec_hhh_in VNET(ipsec_hhh_in) #define V_ipsec_hhh_out VNET(ipsec_hhh_out) extern epoch_t net_epoch_preempt; extern epoch_t net_epoch; #endif /* _KERNEL */ typedef enum { IFCOUNTER_IPACKETS = 0, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, IFCOUNTERS /* Array size. */ } ift_counter; typedef struct ifnet * if_t; typedef void (*if_start_fn_t)(if_t); typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); typedef void (*if_init_fn_t)(void *); typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ift_counter); struct ifnet_hw_tsomax { u_int tsomaxbytes; /* TSO total burst length limit in bytes */ u_int tsomaxsegcount; /* TSO maximum segment count */ u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ }; /* Interface encap request types */ typedef enum { IFENCAP_LL = 1 /* pre-calculate link-layer header */ } ife_type; /* * The structure below allows to request various pre-calculated L2/L3 headers * for different media. Requests varies by type (rtype field). * * IFENCAP_LL type: pre-calculates link header based on address family * and destination lladdr. * * Input data fields: * buf: pointer to destination buffer * bufsize: buffer size * flags: IFENCAP_FLAG_BROADCAST if destination is broadcast * family: address family defined by AF_ constant. * lladdr: pointer to link-layer address * lladdr_len: length of link-layer address * hdata: pointer to L3 header (optional, used for ARP requests). * Output data fields: * buf: encap data is stored here * bufsize: resulting encap length is stored here * lladdr_off: offset of link-layer address from encap hdr start * hdata: L3 header may be altered if necessary */ struct if_encap_req { u_char *buf; /* Destination buffer (w) */ size_t bufsize; /* size of provided buffer (r) */ ife_type rtype; /* request type (r) */ uint32_t flags; /* Request flags (r) */ int family; /* Address family AF_* (r) */ int lladdr_off; /* offset from header start (w) */ int lladdr_len; /* lladdr length (r) */ char *lladdr; /* link-level address pointer (r) */ char *hdata; /* Upper layer header data (rw) */ }; #define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ /* * Network interface send tag support. The storage of "struct * m_snd_tag" comes from the network driver and it is free to allocate * as much additional space as it wants for its own use. */ struct m_snd_tag; #define IF_SND_TAG_TYPE_RATE_LIMIT 0 #define IF_SND_TAG_TYPE_UNLIMITED 1 #define IF_SND_TAG_TYPE_MAX 2 struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ uint32_t flowid; /* mbuf hash value */ uint32_t flowtype; /* mbuf hash type */ }; struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ }; struct if_snd_tag_rate_limit_params { uint64_t max_rate; /* in bytes/s */ uint32_t queue_level; /* 0 (empty) .. 65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 uint32_t reserved; /* padding */ }; union if_snd_tag_alloc_params { struct if_snd_tag_alloc_header hdr; struct if_snd_tag_alloc_rate_limit rate_limit; struct if_snd_tag_alloc_rate_limit unlimited; }; union if_snd_tag_modify_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); /* * Structure defining a network interface. */ struct ifnet { /* General book keeping of interface lists. */ - TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ + STAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained (CK_) */ LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ - TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ + STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ void *if_l2com; /* pointer to protocol bits */ const char *if_dname; /* driver name */ int if_dunit; /* unit or IF_DUNIT_NONE */ u_short if_index; /* numeric abbreviation for this if */ short if_index_reserved; /* spare space to grow if_index */ char if_xname[IFNAMSIZ]; /* external name (name + unit) */ char *if_description; /* interface description */ /* Variable fields that are touched by the stack and drivers. */ int if_flags; /* up/down, broadcast, etc. */ int if_drv_flags; /* driver-managed status flags */ int if_capabilities; /* interface features & capabilities */ int if_capenable; /* enabled features & capabilities */ void *if_linkmib; /* link-type-specific MIB data */ size_t if_linkmiblen; /* length of above data */ u_int if_refcount; /* reference count */ /* These fields are shared with struct if_data. */ uint8_t if_type; /* ethernet, tokenring, etc */ uint8_t if_addrlen; /* media address length */ uint8_t if_hdrlen; /* media header length */ uint8_t if_link_state; /* current link state */ uint32_t if_mtu; /* maximum transmission unit */ uint32_t if_metric; /* routing metric (external only) */ uint64_t if_baudrate; /* linespeed */ uint64_t if_hwassist; /* HW offload capabilities, see IFCAP */ time_t if_epoch; /* uptime at attach or stat reset */ struct timeval if_lastchange; /* time of last administrative change */ struct ifaltq if_snd; /* output queue (includes altq) */ struct task if_linktask; /* task for link change events */ /* Addresses of different protocol families assigned to this if. */ struct mtx if_addr_lock; /* lock to protect address lists */ /* * if_addrhead is the list of all addresses associated to * an interface. * Some code in the kernel assumes that first element * of the list has type AF_LINK, and contains sockaddr_dl * addresses which store the link-level address and the name * of the interface. * However, access to the AF_LINK address through this * field is deprecated. Use if_addr or ifaddr_byindex() instead. */ struct ifaddrhead if_addrhead; /* linked list of addresses per if */ struct ifmultihead if_multiaddrs; /* multicast addresses configured */ int if_amcount; /* number of all-multicast requests */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_hw_addr; /* hardware link-level address */ const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ - struct rwlock if_afdata_lock; + struct mtx if_afdata_lock; void *if_afdata[AF_MAX]; int if_afdata_initialized; /* Additional features hung off the interface. */ u_int if_fib; /* interface FIB */ struct vnet *if_vnet; /* pointer to network stack instance */ struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ struct bpf_if *if_bpf; /* packet filter structure */ int if_pcount; /* number of promiscuous listeners */ void *if_bridge; /* bridge glue */ void *if_lagg; /* lagg glue */ void *if_pf_kif; /* pf glue */ struct carp_if *if_carp; /* carp interface structure */ struct label *if_label; /* interface MAC label */ struct netmap_adapter *if_netmap; /* netmap(4) softc */ /* Various procedures of the layer2 encapsulation and drivers. */ int (*if_output) /* output routine (enqueue) */ (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void (*if_input) /* input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); struct mbuf *(*if_bridge_input)(struct ifnet *, struct mbuf *); int (*if_bridge_output)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*if_bridge_linkstate)(struct ifnet *ifp); if_start_fn_t if_start; /* initiate output routine */ if_ioctl_fn_t if_ioctl; /* ioctl routine */ if_init_fn_t if_init; /* Init routine */ int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); if_qflush_fn_t if_qflush; /* flush any queue */ if_transmit_fn_t if_transmit; /* initiate output routine */ void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); if_get_counter_t if_get_counter; /* get counter values */ int (*if_requestencap) /* make link header from request */ (struct ifnet *, struct if_encap_req *); /* Statistics. */ counter_u64_t if_counters[IFCOUNTERS]; /* Stuff that's only temporary and doesn't belong here. */ /* * Network adapter TSO limits: * =========================== * * If the "if_hw_tsomax" field is zero the maximum segment * length limit does not apply. If the "if_hw_tsomaxsegcount" * or the "if_hw_tsomaxsegsize" field is zero the TSO segment * count limit does not apply. If all three fields are zero, * there is no TSO limit. * * NOTE: The TSO limits should reflect the values used in the * BUSDMA tag a network adapter is using to load a mbuf chain * for transmission. The TCP/IP network stack will subtract * space for all linklevel and protocol level headers and * ensure that the full mbuf chain passed to the network * adapter fits within the given limits. */ u_int if_hw_tsomax; /* TSO maximum size in bytes */ u_int if_hw_tsomaxsegcount; /* TSO maximum segment count */ u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */ /* * Network adapter send tag support: */ if_snd_tag_alloc_t *if_snd_tag_alloc; if_snd_tag_modify_t *if_snd_tag_modify; if_snd_tag_query_t *if_snd_tag_query; if_snd_tag_free_t *if_snd_tag_free; /* Ethernet PCP */ uint8_t if_pcp; /* * Netdump hooks to be called while dumping. */ struct netdump_methods *if_netdump_methods; + struct epoch_context if_epoch_ctx; /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel * binary interface. */ int if_ispare[4]; /* general use */ }; /* for compatibility with other BSDs */ #define if_name(ifp) ((ifp)->if_xname) /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_lock) #define IF_ADDR_RLOCK(if) epoch_enter_preempt(net_epoch_preempt); #define IF_ADDR_RUNLOCK(if) epoch_exit_preempt(net_epoch_preempt); #define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch() || mtx_owned(&(if)->if_addr_lock)) #define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED) +#define NET_EPOCH_ENTER() epoch_enter_preempt(net_epoch_preempt) +#define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt) + /* * Function variations on locking macros intended to be used by loadable * kernel modules in order to divorce them from the internals of address list * locking. */ void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */ void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */ void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL #ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); /* interface address change event */ typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); /* new interface arrival event */ typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); /* Interface up/down event */ #define IFNET_EVENT_UP 0 #define IFNET_EVENT_DOWN 1 #define IFNET_EVENT_PCP 2 /* priority code point, PCP */ typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event); EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); #endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups */ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; - TAILQ_HEAD(, ifg_member) ifg_members; - TAILQ_ENTRY(ifg_group) ifg_next; + STAILQ_HEAD(, ifg_member) ifg_members; /* (CK_) */ + STAILQ_ENTRY(ifg_group) ifg_next; /* (CK_) */ }; struct ifg_member { - TAILQ_ENTRY(ifg_member) ifgm_next; + STAILQ_ENTRY(ifg_member) ifgm_next; /* (CK_) */ struct ifnet *ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; - TAILQ_ENTRY(ifg_list) ifgl_next; + STAILQ_ENTRY(ifg_list) ifgl_next; /* (CK_) */ }; #ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); /* group detach event */ typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ #define IF_AFDATA_LOCK_INIT(ifp) \ - rw_init(&(ifp)->if_afdata_lock, "if_afdata") + mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF) -#define IF_AFDATA_WLOCK(ifp) rw_wlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RLOCK(ifp) rw_rlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_WUNLOCK(ifp) rw_wunlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RUNLOCK(ifp) rw_runlock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_WLOCK(ifp) mtx_lock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_RLOCK(ifp) epoch_enter_preempt(net_epoch_preempt) +#define IF_AFDATA_WUNLOCK(ifp) mtx_unlock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_RUNLOCK(ifp) epoch_exit_preempt(net_epoch_preempt) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) -#define IF_AFDATA_TRYLOCK(ifp) rw_try_wlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_DESTROY(ifp) rw_destroy(&(ifp)->if_afdata_lock) +#define IF_AFDATA_TRYLOCK(ifp) mtx_trylock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_DESTROY(ifp) mtx_destroy(&(ifp)->if_afdata_lock) -#define IF_AFDATA_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED) -#define IF_AFDATA_RLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_RLOCKED) -#define IF_AFDATA_WLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_WLOCKED) -#define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED) +#define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch() || mtx_owned(&(ifp)->if_afdata_lock)) +#define IF_AFDATA_RLOCK_ASSERT(ifp) MPASS(in_epoch()); +#define IF_AFDATA_WLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED) +#define IF_AFDATA_UNLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED) /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). */ #define IF_MINMTU 72 #define IF_MAXMTU 65535 #define TOEDEV(ifp) ((ifp)->if_llsoftc) /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. * * NOTE: a 'struct ifaddr' is always at the beginning of a larger * chunk of malloc'ed memory, where we store the three addresses * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. */ struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct ifnet *ifa_ifp; /* back-pointer to interface */ struct carp_softc *ifa_carp; /* pointer to CARP data */ CK_STAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ u_int ifa_refcnt; /* references to this structure */ counter_u64_t ifa_ipackets; counter_u64_t ifa_opackets; counter_u64_t ifa_ibytes; counter_u64_t ifa_obytes; struct epoch_context ifa_epoch_ctx; }; struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. */ struct ifmultiaddr { CK_STAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ struct sockaddr *ifma_addr; /* address this membership is for */ struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ struct ifnet *ifma_ifp; /* back-pointer to interface */ u_int ifma_refcount; /* reference count */ void *ifma_protospec; /* protocol-specific state, if any */ struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ struct epoch_context ifma_epoch_ctx; }; extern struct rwlock ifnet_rwlock; extern struct sx ifnet_sxlock; #define IFNET_WLOCK() do { \ sx_xlock(&ifnet_sxlock); \ rw_wlock(&ifnet_rwlock); \ } while (0) #define IFNET_WUNLOCK() do { \ rw_wunlock(&ifnet_rwlock); \ sx_xunlock(&ifnet_sxlock); \ } while (0) /* * To assert the ifnet lock, you must know not only whether it's for read or * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) -#define IFNET_RLOCK_NOSLEEP_ASSERT() rw_assert(&ifnet_rwlock, RA_RLOCKED) +#define IFNET_RLOCK_NOSLEEP_ASSERT() MPASS(in_epoch()) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) -#define IFNET_RLOCK_NOSLEEP() rw_rlock(&ifnet_rwlock) +#define IFNET_RLOCK_NOSLEEP() epoch_enter_preempt(net_epoch_preempt) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) -#define IFNET_RUNLOCK_NOSLEEP() rw_runlock(&ifnet_rwlock) +#define IFNET_RUNLOCK_NOSLEEP() epoch_exit_preempt(net_epoch_preempt) /* * Look up an ifnet given its index; the _ref variant also acquires a * reference that must be freed using if_rele(). It is almost always a bug * to call ifnet_byindex() instead of ifnet_byindex_ref(). */ struct ifnet *ifnet_byindex(u_short idx); struct ifnet *ifnet_byindex_locked(u_short idx); struct ifnet *ifnet_byindex_ref(u_short idx); /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. */ struct ifaddr *ifaddr_byindex(u_short idx); VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(int, if_index); VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_if_index VNET(if_index) #define V_loif VNET(loif) #ifdef MCAST_VERBOSE #define MCDPRINTF printf #else #define MCDPRINTF(...) #endif int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_delmulti_ifma_flags(struct ifmultiaddr *, int flags); void if_detach(struct ifnet *); void if_purgeaddrs(struct ifnet *); void if_delallmulti(struct ifnet *); void if_down(struct ifnet *); struct ifmultiaddr * if_findmulti(struct ifnet *, const struct sockaddr *); void if_freemulti(struct ifmultiaddr *ifma); void if_free(struct ifnet *); void if_initname(struct ifnet *, const char *, int); void if_link_state_change(struct ifnet *, int); int if_printf(struct ifnet *, const char *, ...) __printflike(2, 3); void if_ref(struct ifnet *); void if_rele(struct ifnet *); int if_setlladdr(struct ifnet *, const u_char *, int); void if_up(struct ifnet *); int ifioctl(struct socket *, u_long, caddr_t, struct thread *); int ifpromisc(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *ifunit_ref(const char *); int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *); struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); int ifa_ifwithaddr_check(const struct sockaddr *); struct ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int); struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, struct sockaddr *, u_int); struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); typedef void if_com_free_t(void *com, u_char type); void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); void if_data_copy(struct ifnet *, struct if_data *); uint64_t if_get_counter_default(struct ifnet *, ift_counter); void if_inc_counter(struct ifnet *, ift_counter, int64_t); #define IF_LLADDR(ifp) \ LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); uint64_t if_getbaudrate(if_t ifp); int if_setcapabilities(if_t ifp, int capabilities); int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); int if_getcapabilities(if_t ifp); int if_togglecapenable(if_t ifp, int togglecap); int if_setcapenable(if_t ifp, int capenable); int if_setcapenablebit(if_t ifp, int setcap, int clearcap); int if_getcapenable(if_t ifp); const char *if_getdname(if_t ifp); int if_setdev(if_t ifp, void *dev); int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); int if_getdrvflags(if_t ifp); int if_setdrvflags(if_t ifp, int flags); int if_clearhwassist(if_t ifp); int if_sethwassistbits(if_t ifp, int toset, int toclear); int if_sethwassist(if_t ifp, int hwassist_bit); int if_gethwassist(if_t ifp); int if_setsoftc(if_t ifp, void *softc); void *if_getsoftc(if_t ifp); int if_setflags(if_t ifp, int flags); int if_gethwaddr(if_t ifp, struct ifreq *); int if_setmtu(if_t ifp, int mtu); int if_getmtu(if_t ifp); int if_getmtu_family(if_t ifp, int family); int if_setflagbits(if_t ifp, int set, int clear); int if_getflags(if_t ifp); int if_sendq_empty(if_t ifp); int if_setsendqready(if_t ifp); int if_setsendqlen(if_t ifp, int tx_desc_count); int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax); int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount); int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize); u_int if_gethwtsomax(if_t ifp); u_int if_gethwtsomaxsegcount(if_t ifp); u_int if_gethwtsomaxsegsize(if_t ifp); int if_input(if_t ifp, struct mbuf* sendmp); int if_sendq_prepend(if_t ifp, struct mbuf *m); struct mbuf *if_dequeue(if_t ifp); int if_setifheaderlen(if_t ifp, int len); void if_setrcvif(struct mbuf *m, if_t ifp); void if_setvtag(struct mbuf *m, u_int16_t tag); u_int16_t if_getvtag(struct mbuf *m); int if_vlantrunkinuse(if_t ifp); caddr_t if_getlladdr(if_t ifp); void *if_gethandle(u_char); void if_bpfmtap(if_t ifp, struct mbuf *m); void if_etherbpfmtap(if_t ifp, struct mbuf *m); void if_vlancap(if_t ifp); int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_count(if_t ifp, int max); int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg); int if_getamcount(if_t ifp); struct ifaddr * if_getifaddr(if_t ifp); /* Functions */ void if_setinitfn(if_t ifp, void (*)(void *)); void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t)); void if_setstartfn(if_t ifp, void (*)(if_t)); void if_settransmitfn(if_t ifp, if_transmit_fn_t); void if_setqflushfn(if_t ifp, if_qflush_fn_t); void if_setgetcounterfn(if_t ifp, if_get_counter_t); /* Revisit the below. These are inline functions originally */ int drbr_inuse_drv(if_t ifp, struct buf_ring *br); struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); /* TSO */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *); int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); /* accessors for struct ifreq */ void *ifr_data_get_ptr(void *ifrp); #ifdef DEVICE_POLLING enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; typedef int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count); int ether_poll_register(poll_handler_t *h, if_t ifp); int ether_poll_deregister(if_t ifp); #endif /* DEVICE_POLLING */ #endif /* _KERNEL */ #include /* XXXAO: temporary unconditional include */ #endif /* !_NET_IF_VAR_H_ */ Index: head/sys/net/route.c =================================================================== --- head/sys/net/route.c (revision 334117) +++ head/sys/net/route.c (revision 334118) @@ -1,2256 +1,2256 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(const struct rtentry *rt, void *arg); static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags); struct if_mtuinfo { struct ifnet *ifp; int mtu; }; static int if_updatemtu_cb(struct radix_node *, void *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rnh is [fib=0][af=0]. */ rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } u_int rt_tables_get_gen(int table, int fam) { struct rib_head *rnh; rnh = *rt_tables_get_rnh_ptr(table, fam); KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d", __func__, table, fam)); return (rnh->rnh_gen); } /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); } static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, 0); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); #endif struct rib_head * rt_table_init(int offset) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Init locks */ RIB_LOCK_INIT(rh); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RLOCK(rh); #ifdef INVARIANTS else RIB_LOCK_ASSERT(rh); #endif rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); return (newrt); } else if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); /* * Either we hit the root or could not find any match, * which basically means: "cannot get there from here". */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ R_Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct rib_head *rnh; ifa = NULL; + NET_EPOCH_ENTER(); rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; goto out; } - /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt) { if (!sa_equal(src, rt->rt_gateway)) { error = EINVAL; goto done; } if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { error = EINVAL; goto done; } } if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; goto done; } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: if (rt != NULL) RTFREE_LOCKED(rt); flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); flags = rt->rt_flags; } stat = &V_rtstat.rts_dynamic; } else { /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ if ((flags & RTF_GATEWAY) == 0) rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); -out: + out: + NET_EPOCH_EXIT(); if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); - if (ifa != NULL) - ifa_free(ifa); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; + MPASS(in_epoch()); if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt; rt = rtalloc1_fib(gateway, 0, flags, fibnum); if (rt == NULL) - return (NULL); + goto out; /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; - ifa_ref(ifa); } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) - return (NULL); + goto out; } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; - else - ifa_free(oifa); } + out: return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp. * * Returns 0 on success. */ int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = rt->rt_gateway; dst = info->rti_info[RTAX_GATEWAY]; if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (rt->rt_flags & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = rt->rt_mtu; } info->rti_flags = rt->rt_flags; info->rti_ifp = rt->rt_ifp; info->rti_ifa = rt->rt_ifa; if (flags & NHR_REF) { /* Do 'traditional' refcouting */ if_ref(info->rti_ifp); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * * if @flags contains NHR_REF, refcouting is performed on rt_ifp. * All references can be released later by calling rib_free_info() * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rt->rt_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { if_rele(info->rti_ifp); } /* * Iterates over all existing fibs in system calling * @setwa_f function prior to traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rnh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, af, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); continue; } for (i = 1; i <= AF_MAX; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, i, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); } } } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; int error; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; error = 0; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; rt = rt_unlinkrte(di->rnh, info, &error); if (rt == NULL) { /* Either not allowed or not matched. Skip entry */ return (0); } /* Entry was unlinked. Add to the list and return */ rt->rt_chain = di->head; di->head = rt; return (0); } /* * Iterates over all existing fibs in system. * Deletes each element for which @filter_f function returned * non-zero value. * If @af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; uint32_t fibnum; int i, start, end; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { start = af; end = af; } else { start = 1; end = AF_MAX; } for (i = start; i <= end; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; di.rnh = rnh; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); if (di.head == NULL) continue; /* We might have something to reclaim */ while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_notifydelete(rt, &di.info); RTFREE_LOCKED(rt); } } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, void *arg) { struct ifnet *ifp = arg; if (rt->rt_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); return (1); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ void rt_flushifroutes_af(struct ifnet *ifp, int af) { KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", __func__, af, AF_MAX)); rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); } void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns unlinked, locked and referenced @rtentry on success, * Returns NULL and sets @perror to: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete PINNED route without appropriate flag. * ENOENT - if supplied filter function returned 0 (not matched). */ static struct rtentry * rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) { struct sockaddr *dst, *netmask; struct rtentry *rt; struct radix_node *rn; dst = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); if (rt == NULL) { *perror = ESRCH; return (NULL); } if ((info->rti_flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ if (rt->rt_flags & RTF_PINNED) { *perror = EADDRINUSE; return (NULL); } } if (info->rti_filter != NULL) { if (info->rti_filter(rt, info->rti_filterdata) == 0) { /* Not matched */ *perror = ENOENT; return (NULL); } /* * Filter function requested rte deletion. * Ease the caller work by filling in remaining info * from that particular entry. */ info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ *perror = ESRCH; #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) rn = rt_mpath_unlink(rnh, info, rt, perror); else #endif rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); if (rn == NULL) return (NULL); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; *perror = 0; return (rt); } static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. */ ifa = rt->rt_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ + NET_EPOCH_ENTER(); if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; - ifa_free(ifa); } if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; + ifa_ref(info->rti_ifa); } else error = ENETUNREACH; + NET_EPOCH_EXIT(); return (error); } static int if_updatemtu_cb(struct radix_node *rn, void *arg) { struct rtentry *rt; struct if_mtuinfo *ifmtu; rt = (struct rtentry *)rn; ifmtu = (struct if_mtuinfo *)arg; if (rt->rt_ifp != ifmtu->ifp) return (0); if (rt->rt_mtu >= ifmtu->mtu) { /* We have to decrease mtu regardless of flags */ rt->rt_mtu = ifmtu->mtu; return (0); } /* * New MTU is bigger. Check if are allowed to alter it */ if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { /* * Skip routes with user-supplied MTU and * non-interface routes */ return (0); } /* We are safe to update route MTU */ rt->rt_mtu = ifmtu->mtu; return (0); } void rt_updatemtu(struct ifnet *ifp) { struct if_mtuinfo ifmtu; struct rib_head *rnh; int i, j; ifmtu.ifp = ifp; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { ifmtu.mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH /* * Deletes key for single-path routes, unlinks rtentry with * gateway specified in @info from multi-path routes. * * Returnes unlinked entry. In case of failure, returns NULL * and sets @perror to ESRCH. */ static struct radix_node * rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt; // *rto = NULL; struct radix_node *rn; struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; rt = rt_mpath_matchgate(rto, gw); if (rt == NULL) { *perror = ESRCH; return (NULL); } /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gw && (rt->rt_gateway->sa_len != gw->sa_len || memcmp(rt->rt_gateway, gw, gw->sa_len))) { *perror = ESRCH; return (NULL); } } /* * use the normal delete code to remove * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); *perror = 0; return (rn); } /* * if the entry is 2nd and on up */ if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); *perror = 0; rn = (struct radix_node *)rt; return (rn); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0; struct rtentry *rt, *rt_old; struct radix_node *rn; struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } RIB_WLOCK(rnh); rt = rt_unlinkrte(rnh, info, &error); RIB_WUNLOCK(rnh); if (error != 0) return (error); rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) return (error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(ifa); return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(ifa); uma_zfree(V_rtzone, rt); return (error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); RIB_WLOCK(rnh); RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } #endif /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { /* * Force removal and re-try addition * TODO: better multipath&pinned support */ struct sockaddr *info_dst = info->rti_info[RTAX_DST]; info->rti_info[RTAX_DST] = ndst; /* Do not delete existing PINNED(interface) routes */ info->rti_flags &= ~RTF_PINNED; rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; if (rt_old != NULL) rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); } RIB_WUNLOCK(rnh); if (rt_old != NULL) RT_UNLOCK(rt_old); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } if (rt_old != NULL) { rt_notifydelete(rt_old, info); RTFREE(rt_old); } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; case RTM_CHANGE: RIB_WLOCK(rnh); error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); RIB_WUNLOCK(rnh); break; default: error = EOPNOTSUPP; } return (error); } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; struct if_mtuinfo ifmtu; RIB_WLOCK_ASSERT(rnh); rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rt_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { /* * XXX: Temporarily set RTF_RNH_LOCKED flag in the rti_flags * to avoid rlock in the ifa_ifwithroute(). */ info->rti_flags |= RTF_RNH_LOCKED; error = rt_getifa_fib(info, fibnum); info->rti_flags &= ~RTF_RNH_LOCKED; if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL) { if (rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); /* Alter route MTU if necessary */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); /* Set default MTU */ if (rt->rt_mtu == 0) rt->rt_mtu = mtu; if (rt->rt_mtu != mtu) { /* Check if we really need to update */ ifmtu.ifp = rt->rt_ifp; ifmtu.mtu = mtu; if_updatemtu_cb(rt->rt_nodes, &ifmtu); } } /* * This route change may have modified the route's gateway. In that * case, any inpcbs that have cached this route need to invalidate their * llentry cache. */ rnh->rnh_gen++; if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) ifa_free(info->rti_ifa); return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) { if (info->rti_rmx->rmx_mtu != 0) { /* * MTU was explicitly provided by user. * Keep it. */ rt->rt_flags |= RTF_FIXEDMTU; } else { /* * User explicitly sets MTU to 0. * Assume rollback to default. */ rt->rt_flags &= ~RTF_FIXEDMTU; } rt->rt_mtu = info->rti_rmx->rmx_mtu; } if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) startfib = endfib = ifa->ifa_ifp->if_fib; else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) /* this table doesn't exist but others might */ continue; RIB_RLOCK(rnh); rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) { ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = rt->rt_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = rt->rt_ifp->if_index; } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: head/sys/net/rtsock.c =================================================================== --- head/sys/net/rtsock.c (revision 334117) +++ head/sys/net/rtsock.c (revision 334118) @@ -1,1972 +1,1974 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #define SA_SIZE32(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(int) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) ) #endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; static VNET_DEFINE(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void vnet_rts_init(void) { int tmp; if (IS_DEFAULT_VNET(curvnet)) { if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } #ifdef VIMAGE else netisr_register_vnet(&rtsock_nh); #endif } VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_init, 0); #ifdef VIMAGE static void vnet_rts_uninit(void) { netisr_unregister_vnet(&rtsock_nh); } VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_uninit, 0); #endif static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. */ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) { struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct rib_head *rnh; struct rt_addrinfo info; struct sockaddr_storage ss; #ifdef INET6 struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; struct rawcb *rp = NULL; struct walkarg w; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); bzero(&w, sizeof(w)); if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; info.rti_mflags = rtm->rtm_inits; info.rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error) senderr(error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct rt_addrinfo ginfo; struct sockaddr *gdst; bzero(&ginfo, sizeof(ginfo)); bzero(&ss, sizeof(ss)); ss.ss_len = sizeof(ss); ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss; gdst = info.rti_info[RTAX_GATEWAY]; /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. */ if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) { if (ss.ss_family == AF_LINK && ginfo.rti_ifp->if_flags & IFF_LOOPBACK) { info.rti_flags &= ~RTF_GATEWAY; info.rti_flags |= RTF_GWFLAG_COMPAT; } rib_free_info(&ginfo); } } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: case RTM_CHANGE: if (rtm->rtm_type == RTM_ADD) { if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); } saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, fibnum); if (error == 0 && saved_nrt != NULL) { #ifdef INET6 rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } #ifdef INET6 /* rt_msg2() will not be used when RTM_DELETE fails. */ rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; case RTM_GET: rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) senderr(EAFNOSUPPORT); RIB_RLOCK(rnh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { /* * Provide longest prefix match for * address lookup (no mask). * 'route -n get addr' */ rt = (struct rtentry *) rnh->rnh_matchaddr( info.rti_info[RTAX_DST], &rnh->head); } else rt = (struct rtentry *) rnh->rnh_lookup( info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } #ifdef RADIX_MPATH /* * for RTM_CHANGE/LOCK, if we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. * * for RTM_GET, gate is optional even with multipath. * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ if (rt_mpath_capable(rnh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } #endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; + NET_EPOCH_ENTER(); ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); + NET_EPOCH_EXIT(); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, rt->rt_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); RIB_RUNLOCK(rnh); report: RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(curthread->td_ucred) : prison_if(curthread->td_ucred, rt_key(rt)) != 0) { RT_UNLOCK(rt); senderr(ESRCH); } info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(&info, ifp, rt, &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } if (ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info.rti_info[RTAX_IFP] = NULL; info.rti_info[RTAX_IFA] = NULL; } } else if ((ifp = rt->rt_ifp) != NULL) { rtm->rtm_index = ifp->if_index; } /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *new_rtm; new_rtm = malloc(len, M_TEMP, M_NOWAIT); if (new_rtm == NULL) { RT_UNLOCK(rt); senderr(ENOBUFS); } bcopy(rtm, new_rtm, rtm->rtm_msglen); free(rtm, M_TEMP); rtm = new_rtm; alloc_len = len; } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } flush: if (rt != NULL) RTFREE(rt); /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { #ifdef INET6 if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. */ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } } #endif if (error != 0) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } return (error); } static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? */ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { int i; int len, buflen = 0, dlen; caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif #ifdef COMPAT_FREEBSD32 bool compat32 = false; #endif switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { len = sizeof(struct ifa_msghdrl32); compat32 = true; } else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); compat32 = true; break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); #ifdef COMPAT_FREEBSD32 if (compat32) dlen = SA_SIZE32(sa); else #endif dlen = SA_SIZE(sa); if (cp != NULL && buflen >= dlen) { #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; buflen -= dlen; } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. */ cp = NULL; } len += dlen; } if (cp != NULL) { dlen = ALIGN(len) - len; if (buflen < dlen) cp = NULL; else buflen -= dlen; } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal. * Please do not call directly, use rt_routemsg(). * Note that @rt data MAY be inconsistent/invalid: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * * Returns 0 on success. */ int rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; struct mbuf *m; struct rt_msghdr *rtm; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = sa = rt_key(rt); info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL) return (ENOBUFS); rtm = mtod(m, struct rt_msghdr *); rtm->rtm_index = ifp->if_index; rtm->rtm_flags |= rt->rt_flags; rtm->rtm_errno = error; rtm->rtm_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; if (ifp && ifp->if_addr) info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; else info.rti_info[RTAX_IFP] = NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; /* * Preserve the family from the sockaddr, if any, in an m_tag for * use when injecting the mbuf into the routing socket buffer from * the netisr. */ if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } #endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct radix_node *rn, void *vw) { struct walkarg *w = vw; struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; struct sockaddr_storage ss; if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(w->w_req->td->td_ucred) : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. */ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct if_data ifd; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; if_data_copy(ifp, &ifd); IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &ifd, &info, w, len); else error = sysctl_iflist_ifm(ifp, &ifd, &info, w, len); if (error) goto done; } while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: if (ifp != NULL) IF_ADDR_RUNLOCK(ifp); IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct rt_addrinfo info; struct ifaddr *ifa; struct ifmultiaddr *ifma; struct ifnet *ifp; int error, len; error = 0; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) break; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error != 0) break; } } IF_ADDR_RUNLOCK(ifp); if (error != 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rib_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; name ++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { RIB_RLOCK(rnh); error = rnh->rnh_walktree(&rnh->head, sysctl_dumpentry, &w); RIB_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. */ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[nitems(routesw)] }; VNET_DOMAIN_SET(route); Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 334117) +++ head/sys/netinet/igmp.c (revision 334118) @@ -1,3654 +1,3652 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct in_multi_head *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LIST_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. */ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = (char)IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. */ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma, *next; struct in_multi *inm; struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); SLIST_INIT(&inm_free_tmp); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) inm_rele_locked(&inm_free_tmp, inm); inm_clear_recorded(inm); if (__predict_false(ifma_restart)) { ifma_restart = false; goto restart; } } IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&inm_free_tmp); } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); free(igi, M_IGMP); return; } } } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. */ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "0x%08x: %s/%s timer=%d", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LIST_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. */ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 0x%08x query on ifp %p(%s)", ntohl(igmpv3->igmp_group.s_addr), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia != NULL) { + if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); - ifa_free(&ia->ia_ifa); - } + NET_EPOCH_EXIT(); } CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { - if (ia != NULL) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { - if (ia != NULL) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } - if (ia != NULL) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_LIST_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. */ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma, *next; struct in_multi *inm; struct in_multi_head inm_free_tmp; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; SLIST_INIT(&inm_free_tmp); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(&inm_free_tmp, &qrq, &scq, inm, uri_fasthz); break; } if (__predict_false(ifma_restart)) { ifma_restart = false; goto restart; } } IF_ADDR_WUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ inm_release_list_deferred(&inm_free_tmp); } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct in_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval __unused; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm_rele_locked(inmh, inm); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LIST_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. */ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); SLIST_INIT(&inm_free_tmp); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ inm_rele_locked(&inm_free_tmp, inm); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); inm_release_list_deferred(&inm_free_tmp); } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. */ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. */ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; error = 0; IN_MULTI_LOCK_ASSERT(); /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. */ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) { MPASS(inm->inm_refcount > 1); inm_rele_locked(NULL, inm); } inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. */ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving 0x%08x/%s with %d " "pending retransmissions.", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval __unused; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LIST_LOCK_ASSERT(); ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for 0x%08x/%s", __func__, igmp_rec_type_to_str(type), ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LIST_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval __unused, loop; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia != NULL) { + if (ia != NULL) ip->ip_src = ia->ia_addr.sin_addr; - ifa_free(&ia->ia_ifa); - } + NET_EPOCH_EXIT(); } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif #ifdef VIMAGE static void vnet_igmp_init(const void *unused __unused) { netisr_register_vnet(&igmp_nh); } VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_init, NULL); static void vnet_igmp_uninit(const void *unused __unused) { /* This can happen when we shutdown the entire network stack. */ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister_vnet(&igmp_nh); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_uninit, NULL); #endif #ifdef DDB DB_SHOW_COMMAND(igi_list, db_show_igi_list) { struct igmp_ifsoftc *igi, *tigi; LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; if (!have_addr) { db_printf("usage: show igi_list \n"); return; } igi_head = (struct _igi_list *)addr; LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { db_printf("igmp_ifsoftc %p:\n", igi); db_printf(" ifp %p\n", igi->igi_ifp); db_printf(" version %u\n", igi->igi_version); db_printf(" v1_timer %u\n", igi->igi_v1_timer); db_printf(" v2_timer %u\n", igi->igi_v2_timer); db_printf(" v3_timer %u\n", igi->igi_v3_timer); db_printf(" flags %#x\n", igi->igi_flags); db_printf(" rv %u\n", igi->igi_rv); db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); /* struct mbufq igi_gq; */ db_printf("\n"); } } #endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); break; case MOD_UNLOAD: CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 334117) +++ head/sys/netinet/in.c (revision 334118) @@ -1,1506 +1,1508 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, (cmd == SIOCDIFADDR) ? false : true); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); if (ii->ii_allhosts) { (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. */ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. */ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_RLOCK(ifp); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* IF_ADDR_RUNLOCK(ifp); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int found; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } IF_ADDR_RUNLOCK(ifp); return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); IN_MULTI_UNLOCK(); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { struct in_multi_head purgeinms; struct in_multi *inm; struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; inm_rele_locked(&purgeinms, inm); if (__predict_false(ifma_restart)) { ifma_restart = true; goto restart; } } IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void -in_lltable_destroy_lle_unlocked(struct llentry *lle) +in_lltable_destroy_lle_unlocked(epoch_context_t ctx) { + struct llentry *lle; + lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by the datapath to indicate that * the entry was used. */ static void in_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; LLE_REQ_UNLOCK(lle); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); - in_lltable_destroy_lle_unlocked(lle); + epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); lltable_unlink_entry(llt, lle); } /* cancel timer */ if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; const struct sockaddr_in *l3sin; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; l3sin = (const struct sockaddr_in *)l3addr; lim = addr + l3sin->sin_len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC char addrbuf[INET_ADDRSTRLEN]; log(LOG_INFO, "IPv4 address: \"%s\" " "is not on the network\n", inet_ntoa_r(l3sin->sin_addr, addrbuf)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; - LIST_FOREACH(lle, lleh, lle_next) { + CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { - in_lltable_destroy_lle_unlocked(lle); + epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = in_lltable_mark_used; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 334117) +++ head/sys/netinet/in_mcast.c (revision 334118) @@ -1,3107 +1,3107 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_list_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); struct mtx in_multi_free_mtx; MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); struct sx in_multi_sx; SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); int ifma_restart; /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). */ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static int imo_grow(struct ip_moptions *); static size_t imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(const struct ip_moptions *, const size_t, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif static struct grouptask free_gtask; static struct in_multi_head inm_free_list; static void inm_release_task(void *arg __unused); static void inm_init(void) { SLIST_INIT(&inm_free_list); taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task"); } SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, inm_init, NULL); void inm_release_list_deferred(struct in_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); GROUPTASK_ENQUEUE(&free_gtask); } void inm_disconnect(struct in_multi *inm) { struct ifnet *ifp; struct ifmultiaddr *ifma, *ll_ifma; ifp = inm->inm_ifp; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->inm_ifma; if_ref(ifp); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); ifma_restart = true; } } } void inm_release_deferred(struct in_multi *inm) { struct in_multi_head tmp; IN_MULTI_LIST_LOCK_ASSERT(); MPASS(inm->inm_refcount > 0); if (--inm->inm_refcount == 0) { SLIST_INIT(&tmp); inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); inm_release_list_deferred(&tmp); } } static void inm_release_task(void *arg __unused) { struct in_multi_head inm_free_tmp; struct in_multi *inm, *tinm; SLIST_INIT(&inm_free_tmp); mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); IN_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); MPASS(inm); inm_release(inm); } IN_MULTI_UNLOCK(); } /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_RLOCK(ifp); inm = inm_lookup_locked(ifp, ina); IF_ADDR_RUNLOCK(ifp); return (inm); } /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_INMFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_multi **pinm; int idx; int nmships; gsin = (const struct sockaddr_in *)group; /* The imo_membership array may be lazy allocated. */ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(const struct ip_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip_msource find; struct in_mfilter *imf; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazy allocated. */ if (imo->imo_mfilters == NULL) return (NULL); imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = imo_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->imo_mfilters[gidx].imf_st[1]; ims = imo_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); inm_acquire_locked(inm); *pinm = inm; } IN_MULTI_LIST_UNLOCK(); if (inm != NULL) return (0); memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) { char addrbuf[INET_ADDRSTRLEN]; panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif inm_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ static void inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); } } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__, haddr, ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. */ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_deferred(inm); } else { *pinm = inm; } IN_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. */ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; error = 0; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); IF_ADDR_WLOCK(inm->inm_ifp); inm_release_deferred(inm); IF_ADDR_WUNLOCK(inm->inm_ifp); IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; #ifdef INVARIANTS char addrbuf[INET_ADDRSTRLEN]; #endif KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa_r(*ap, addrbuf))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; inm = imo->imo_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_INMFILTER); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } static void inp_gcmoptions(epoch_context_t ctx) { struct ip_moptions *imo; struct in_mfilter *imf; struct in_multi *inm; struct ifnet *ifp; size_t idx, nmships; imo = __containerof(ctx, struct ip_moptions, imo_epoch_ctx); nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; if (imf) imf_leave(imf); inm = imo->imo_membership[idx]; ifp = inm->inm_ifp; if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); (void)in_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in_leavegroup(inm, imf); } if (imf) imf_purge(imf); } if (imo->imo_mfilters) free(imo->imo_mfilters, M_INMFILTER); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is deferred to the epoch callback task. */ void inp_freemoptions(struct ip_moptions *imo) { if (imo == NULL) return; epoch_call(net_epoch_preempt, &imo->imo_epoch_ctx, inp_gcmoptions); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia != NULL) { + if (ia != NULL) mreqn.imr_address = IA_SIN(ia)->sin_addr; - ifa_free(&ia->ia_ifa); - } + NET_EPOCH_EXIT(); } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP. * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; struct nhop4_basic nh4; uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { INADDR_TO_IFP(ina, ifp); } else { fibnum = inp ? inp->inp_inc.inc_fibnum : 0; if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) ifp = nh4.nh_ifp; else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; size_t idx; int error, is_new; ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Do argument switcharoo from ip_mreq into * ip_mreq_source to avoid using two instances. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_inp_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->imo_num_memberships; imo->imo_membership[idx] = NULL; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imf_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_imo_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at IGMP layer. */ in_pcbref(inp); INP_WUNLOCK(inp); IN_MULTI_LOCK(); if (is_new) { error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imo_free; } imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); goto out_in_multi_locked; } } out_in_multi_locked: IN_MULTI_UNLOCK(); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (ENXIO); if (error) { imf_rollback(imf); if (is_new) imf_purge(imf); else imf_reap(imf); } else { imf_commit(imf); } out_imo_free: if (error && is_new) { imo->imo_membership[idx] = NULL; --imo->imo_num_memberships; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; int error, is_final; ifp = NULL; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. */ if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ in_pcbref(inp); INP_WUNLOCK(inp); IN_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in_leavegroup_locked(inm, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); } } out_in_multi_locked: IN_MULTI_UNLOCK(); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (ENXIO); if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); if (is_final) { /* Remove the gap in the membership and filter array. */ for (++idx; idx < imo->imo_num_memberships; ++idx) { imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; } imo->imo_num_memberships--; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp, ntohl(addr.s_addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); IN_MULTI_LOCK(); IN_MULTI_LIST_LOCK(); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast", __func__, ntohl(group.s_addr)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) return (retval); IN_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); /* * Only copy-out sources which are in-mode. */ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } IF_ADDR_RUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { "un", "in", "ex" }; static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 334117) +++ head/sys/netinet/in_pcb.c (revision 334118) @@ -1,3130 +1,3130 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #ifdef TCPHPTS #include #endif #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #include static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ short inp_so_options(const struct inpcb *inp) { short so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); - /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ + NET_EPOCH_ENTER(); if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); - if (ia == NULL) + if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); + + } if (ia == NULL) { + printf("ifa_ifwithnet failed\n"); error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - ifa_free(&ia->ia_ifa); goto done; } ifp = ia->ia_ifp; - ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - ifa_free(&ia->ia_ifa); goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; - ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: + NET_EPOCH_EXIT(); if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } void in_pcblist_rele_rlocked(epoch_context_t ctx) { struct in_pcblist *il; struct inpcb *inp; struct inpcbinfo *pcbinfo; int i, n; il = __containerof(ctx, struct in_pcblist, il_epoch_ctx); pcbinfo = il->il_pcbinfo; n = il->il_count; INP_INFO_WLOCK(pcbinfo); for (i = 0; i < n; i++) { inp = il->il_inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); free(il, M_TEMP); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INET6 struct ip6_moptions *im6o = NULL; #endif #ifdef INET struct ip_moptions *imo = NULL; #endif KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return; } #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_LOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); #ifdef INET imo = inp->inp_moptions; inp->inp_moptions = NULL; #endif /* XXXRW: Do as much as possible here. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif INP_LIST_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); im6o = inp->in6p_moptions; inp->in6p_moptions = NULL; } #endif if (inp->inp_options) (void)m_free(inp->inp_options); RO_INVALIDATE_CACHE(&inp->inp_route); inp->inp_vflag = 0; inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif #ifdef INET6 ip6_freemoptions(im6o); #endif #ifdef INET inp_freemoptions(imo); #endif if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { IN_MULTI_LOCK_ASSERT(); in_leavegroup_locked(imo->imo_membership[i], NULL); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; bool locked; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (!locked) in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); else bzero(&xi->xi_socket, sizeof(struct xsocket)); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) return (EOPNOTSUPP); error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = max_pacing_rate, }; int error; INP_WLOCK_ASSERT(inp); if (inp->inp_snd_tag != NULL) return (EINVAL); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); /* * At success increment the refcount on * the send tag's network interface: */ if (error == 0) if_ref(inp->inp_snd_tag->ifp); } return (error); } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; struct ifnet *ifp; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; ifp = mst->ifp; if (ifp == NULL) return; /* * If the device was detached while we still had reference(s) * on the ifp, we assume if_snd_tag_free() was replaced with * stubs. */ ifp->if_snd_tag_free(mst); /* release reference count on network interface */ if_rele(ifp); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; int error; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL. */ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { struct socket *socket; bool did_upgrade; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #endif /* RATELIMIT */ Index: head/sys/netinet/in_var.h =================================================================== --- head/sys/netinet/in_var.h (revision 334117) +++ head/sys/netinet/in_var.h (revision 334118) @@ -1,433 +1,431 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ /* * Argument structure for SIOCAIFADDR. */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; int ifra_vhid; }; #ifdef _KERNEL #include #include #include struct igmp_ifsoftc; struct in_multi; struct lltable; SLIST_HEAD(in_multi_head, in_multi); /* * IPv4 per-interface state. */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ struct igmp_ifsoftc *ii_igmp; /* IGMP state */ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ }; /* * Interface address, Internet version. One of these structures * is allocated for each Internet address on an interface. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_subnet{,mask} in host order */ u_long ia_subnet; /* subnet address */ u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ CK_STAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ struct callout ia_garp_timer; /* timer for retransmitting GARPs */ int ia_garp_count; /* count of retransmitted GARPs */ }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) extern u_char inetctlerrmap[]; #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt /* * Hash table for IP addresses. */ CK_STAILQ_HEAD(in_ifaddrhead, in_ifaddr); LIST_HEAD(in_ifaddrhashhead, in_ifaddr); VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead); VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ #define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET(in_ifaddrhead) #define V_in_ifaddrhmask VNET(in_ifaddrhmask) #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) extern struct rmlock in_ifaddr_lock; #define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED) #define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED) #define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock) #define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED) #define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ do { \ \ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ } while (0) /* * Macro for finding the interface (ifnet structure) corresponding to one * of our IP addresses. */ #define INADDR_TO_IFP(addr, ifp) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ { \ struct in_ifaddr *ia; \ \ INADDR_TO_IFADDR(addr, ia); \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ } /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ #define IFP_TO_IA(ifp, ia, t) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ /* struct rm_priotracker *t; */ \ do { \ IN_IFADDR_RLOCK((t)); \ for ((ia) = CK_STAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = CK_STAILQ_NEXT((ia), ia_link)) \ continue; \ - if ((ia) != NULL) \ - ifa_ref(&(ia)->ia_ifa); \ IN_IFADDR_RUNLOCK((t)); \ } while (0) /* * Legacy IPv4 IGMP per-link structure. */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; }; /* * IPv4 multicast IGMP-layer source entry. */ struct ip_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ struct ims_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } ims_st[2]; /* state at t0, t1 */ uint8_t ims_stp; /* pending query */ }; /* * IPv4 multicast PCB-layer source entry. */ struct in_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ uint8_t imsl_st[2]; /* state before/at commit */ }; RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ static __inline int ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) { if (a->ims_haddr < b->ims_haddr) return (-1); if (a->ims_haddr == b->ims_haddr) return (0); return (1); } RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); /* * IPv4 multicast PCB-layer group filter descriptor. */ struct in_mfilter { struct ip_msource_tree imf_sources; /* source list for (S,G) */ u_long imf_nsrc; /* # of source entries */ uint8_t imf_st[2]; /* state before/at commit */ }; /* * IPv4 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When IGMPv3 is active, inm_timer is the response to group query timer. * The state-change timer inm_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: inm_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in_multi { LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ void *inm_rti; /* unused, legacy field */ u_int inm_refcount; /* reference count */ /* New fields for IGMPv3 follow. */ struct igmp_ifsoftc *inm_igi; /* IGMP info */ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ struct ip_msource_tree inm_srcs; /* tree of sources */ u_long inm_nsrc; /* # of tree entries */ struct mbufq inm_scq; /* queue of pending * state-change packets */ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ uint16_t inm_sctimer; /* state-change timer */ uint16_t inm_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing IGMPv3 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct inm_st { uint16_t iss_fmode; /* IGMP filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } inm_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, uint8_t t) { t = !!t; if (inm->inm_st[t].iss_ex > 0 && inm->inm_st[t].iss_ex == ims->ims_st[t].ex) return (MCAST_EXCLUDE); else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes * before link layer multicast locks in the lock order. In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in_multi_list_mtx; extern struct sx in_multi_sx; #define IN_MULTI_LIST_LOCK() mtx_lock(&in_multi_list_mtx) #define IN_MULTI_LIST_UNLOCK() mtx_unlock(&in_multi_list_mtx) #define IN_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_OWNED) #define IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED) #define IN_MULTI_LOCK() sx_xlock(&in_multi_sx) #define IN_MULTI_UNLOCK() sx_xunlock(&in_multi_sx) #define IN_MULTI_LOCK_ASSERT() sx_assert(&in_multi_sx, SA_XLOCKED) #define IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED) void inm_disconnect(struct in_multi *inm); extern int ifma_restart; /* Acquire an in_multi record. */ static __inline void inm_acquire_locked(struct in_multi *inm) { IN_MULTI_LIST_LOCK_ASSERT(); ++inm->inm_refcount; } static __inline void inm_acquire(struct in_multi *inm) { IN_MULTI_LIST_LOCK(); inm_acquire_locked(inm); IN_MULTI_LIST_UNLOCK(); } static __inline void inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm) { MPASS(inm->inm_refcount > 0); IN_MULTI_LIST_LOCK_ASSERT(); if (--inm->inm_refcount == 0) { MPASS(inmh != NULL); inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(inmh, inm, inm_nrele); } } /* * Return values for imo_multi_filter(). */ #define MCAST_PASS 0 /* Pass */ #define MCAST_NOTGMEMBER 1 /* This host not a member of group */ #define MCAST_NOTSMEMBER 2 /* This host excluded source */ #define MCAST_MUTED 3 /* [deprecated] */ struct rtentry; struct route; struct ip_moptions; struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr); struct in_multi *inm_lookup(struct ifnet *, const struct in_addr); int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); void inm_commit(struct in_multi *); void inm_clear_recorded(struct in_multi *); void inm_print(const struct in_multi *); int inm_record_source(struct in_multi *inm, const in_addr_t); void inm_release_deferred(struct in_multi *); void inm_release_list_deferred(struct in_multi_head *); struct in_multi * in_addmulti(struct in_addr *, struct ifnet *); int in_joingroup(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_joingroup_locked(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); void in_ifscrub_all(void); void ip_input(struct mbuf *); void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_tryforward(struct mbuf *); void *in_domifattach(struct ifnet *); void in_domifdetach(struct ifnet *, void *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); #endif /* _KERNEL */ /* INET6 stuff */ #include #endif /* _NETINET_IN_VAR_H_ */ Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 334117) +++ head/sys/netinet/ip_divert.c (revision 334118) @@ -1,824 +1,826 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #ifndef INET #error "IPDIVERT requires INET" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef SCTP #include #endif #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw or other packet filters, * see the divert(4) manpage for features. * Packets are selected by the packet filter and tagged with an * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by * the packet filter) and information on the matching filter rule for * subsequent reinjection. The divert_port is used to put the packet * on the corresponding divert socket, while the rule number is passed * up (at least partially) as the sin_port in the struct sockaddr. * * Packets written to the divert socket carry in sin_addr a * destination address, and in sin_port the number of the filter rule * after which to continue processing. * If the destination address is INADDR_ANY, the packet is treated as * as outgoing and sent to ip_output(); otherwise it is treated as * incoming and sent to ip_input(). * Further, sin_zero carries some information on the interface, * which can be used in the reinject -- see comments in the code. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * packet filter processing will start at the rule number after the one * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 * will apply the entire ruleset to the packet). */ /* Internal variables. */ static VNET_DEFINE(struct inpcbhead, divcb); static VNET_DEFINE(struct inpcbinfo, divcbinfo); #define V_divcb VNET(divcb) #define V_divcbinfo VNET(divcbinfo) static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ static eventhandler_tag ip_divert_event_tag; /* * Initialize divert connection block queue. */ static void div_zone_change(void *tag) { uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); } static int div_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "divinp"); return (0); } static void div_init(void) { /* * XXX We don't use the hash list for divert IP, but it's easier to * allocate one-entry hash lists than it is to check all over the * place for hashbase == NULL. */ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", div_inpcb_init, IPI_HASHFIELDS_NONE); } static void div_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_divcbinfo); } VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, div_destroy, NULL); /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ static int div_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; KMOD_IPSTAT_INC(ips_noproto); m_freem(m); return (IPPROTO_DONE); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Setup generic address and protocol structures for div_input routine, * then pass them along with mbuf chain. */ static void divert_packet(struct mbuf *m, int incoming) { struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == NULL) return; ip = mtod(m, struct ip *); /* Delayed checksums are currently not compatible with divert. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; /* record matching rule, in host format */ divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; /* * Record receive interface address, if any. * But only for incoming packets. */ if (incoming) { struct ifaddr *ifa; struct ifnet *ifp; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ ifp = m->m_pkthdr.rcvif; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } if_addr_runlock(ifp); } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); INP_INFO_RLOCK(&V_divcbinfo); LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { INP_RLOCK(inp); sa = inp->inp_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) { SOCKBUF_UNLOCK(&sa->so_rcv); sa = NULL; /* force mbuf reclaim below */ } else sorwakeup_locked(sa); INP_RUNLOCK(inp); break; } } INP_INFO_RUNLOCK(&V_divcbinfo); if (sa == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_noproto); KMOD_IPSTAT_DEC(ips_delivered); } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { struct ip *const ip = mtod(m, struct ip *); struct m_tag *mtag; struct ipfw_rule_ref *dt; int error = 0; /* * An mbuf may hasn't come from userland, but we pretend * that it has. */ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; M_SETFIB(m, so->so_fibnum); if (control) m_freem(control); /* XXX */ mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { /* this should be normal */ mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } m_tag_prepend(m, mtag); } dt = (struct ipfw_rule_ref *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; /* set the starting point. We provide a non-zero slot, * but a non_matching chain_id to skip that info and use * the rulenum/rule_id. */ dt->slot = 1; /* dummy, chain_id is invalid */ dt->chain_id = 0; dt->rulenum = sin->sin_port+1; /* host format ? */ dt->rule_id = 0; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. */ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct mbuf *options = NULL; struct inpcb *inp; dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; inp = sotoinpcb(so); INP_RLOCK(inp); switch (ip->ip_v) { case IPVERSION: /* * Don't allow both user specified and setsockopt * options, and don't allow packet length sizes that * will crash. */ if ((((ip->ip_hl << 2) != sizeof(struct ip)) && inp->inp_options != NULL) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } break; #ifdef INET6 case IPV6_VERSION >> 4: { struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); /* Don't allow packet length sizes that will crash */ if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } break; } #endif default: error = EINVAL; INP_RUNLOCK(inp); goto cantsend; } /* Send packet to output processing */ KMOD_IPSTAT_INC(ips_rawout); /* XXX */ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Get ready to inject the packet into ip_output(). * Just in case socket options were specified on the * divert socket, we duplicate them. This is done * to avoid having to hold the PCB locks over the call * to ip_output(), as doing this results in a number of * lock ordering complexities. * * Note that we set the multicast options argument for * ip_output() to NULL since it should be invariant that * they are not present. */ KASSERT(inp->inp_moptions == NULL, ("multicast options set on a divert socket")); /* * XXXCSJP: It is unclear to me whether or not it makes * sense for divert sockets to have options. However, * for now we will duplicate them with the INP locks * held so we can use them in ip_output() without * requring a reference to the pcb. */ if (inp->inp_options != NULL) { options = m_dup(inp->inp_options, M_NOWAIT); if (options == NULL) { INP_RUNLOCK(inp); error = ENOBUFS; goto cantsend; } } INP_RUNLOCK(inp); switch (ip->ip_v) { case IPVERSION: error = ip_output(m, options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); break; #ifdef INET6 case IPV6_VERSION >> 4: error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); break; #endif } if (options != NULL) m_freem(options); } else { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; + NET_EPOCH_ENTER(); ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; + NET_EPOCH_EXIT(); goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; - ifa_free(ifa); + NET_EPOCH_EXIT(); } #ifdef MAC mac_socket_create_mbuf(so, m); #endif /* Send packet to input processing via netisr */ switch (ip->ip_v) { case IPVERSION: /* * Restore M_BCAST flag when destination address is * broadcast. It is expected by ip_tryforward(). */ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) m->m_flags |= M_MCAST; else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) m->m_flags |= M_BCAST; netisr_queue_src(NETISR_IP, (uintptr_t)so, m); break; #ifdef INET6 case IPV6_VERSION >> 4: netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); break; #endif default: error = EINVAL; goto cantsend; } } return (error); cantsend: m_freem(m); return (error); } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("div_attach: inp != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; INP_INFO_WLOCK(&V_divcbinfo); error = in_pcballoc(so, &V_divcbinfo); if (error) { INP_INFO_WUNLOCK(&V_divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; INP_WUNLOCK(inp); return 0; } static void div_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); /* XXX defer destruction to epoch_call */ in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_divcbinfo); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_bind: inp == NULL")); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbind * and should probably have its own family. */ if (nam->sa_family != AF_INET) return EAFNOSUPPORT; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } static void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) return; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_divcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_divcbinfo); gencnt = V_divcbinfo.ipi_gencnt; n = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_EPOCH_CALL_WAITOK); inp_list = il->il_inp_list; INP_INFO_RLOCK(&V_divcbinfo); for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_divcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } il->il_count = n; il->il_pcbinfo = &V_divcbinfo; epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&V_divcbinfo); xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } return error; } #ifdef SYSCTL_NODE static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif struct pr_usrreqs div_usrreqs = { .pru_attach = div_attach, .pru_bind = div_bind, .pru_control = in_control, .pru_detach = div_detach, .pru_peeraddr = in_getpeeraddr, .pru_send = div_send, .pru_shutdown = div_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel }; struct protosw div_protosw = { .pr_type = SOCK_RAW, .pr_protocol = IPPROTO_DIVERT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = div_input, .pr_ctlinput = div_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = div_init, .pr_usrreqs = &div_usrreqs }; static int div_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* * Protocol will be initialized by pf_proto_register(). * We don't have to register ip_protox because we are not * a true IP protocol that goes over the wire. */ err = pf_proto_register(PF_INET, &div_protosw); if (err != 0) return (err); ip_divert_ptr = divert_packet; ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. */ err = EPERM; break; case MOD_UNLOAD: /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. */ INP_INFO_WLOCK(&V_divcbinfo); if (V_divcbinfo.ipi_count != 0) { err = EBUSY; INP_INFO_WUNLOCK(&V_divcbinfo); break; } ip_divert_ptr = NULL; /* XXX defer to epoch_call ? */ err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); #ifndef VIMAGE div_destroy(NULL); #endif EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3); MODULE_VERSION(ipdivert, 1); Index: head/sys/netinet/ip_icmp.c =================================================================== --- head/sys/netinet/ip_icmp.c (revision 334117) +++ head/sys/netinet/ip_icmp.c (revision 334118) @@ -1,1052 +1,1055 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. */ static VNET_DEFINE(int, icmplim) = 200; #define V_icmplim VNET(icmplim) SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, "Maximum number of ICMP responses per second"); static VNET_DEFINE(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, "Enable logging of ICMP response rate limiting"); #ifdef INET VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); VNET_PCPUSTAT_SYSINIT(icmpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, "Reply to ICMP Address Mask Request packets"); static VNET_DEFINE(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, "Fake reply to ICMP Address Mask Request packets"); VNET_DEFINE(int, drop_redirect) = 0; #define V_drop_redirect VNET(drop_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_redirect), 0, "Ignore ICMP redirects"); static VNET_DEFINE(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); static VNET_DEFINE(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, "ICMP reply source for non-local packets"); static VNET_DEFINE(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); static VNET_DEFINE(int, icmp_quotelen) = 8; #define V_icmp_quotelen VNET(icmp_quotelen) SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); static VNET_DEFINE(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, "Reply to multicast ICMP Echo Request and Timestamp packets"); static VNET_DEFINE(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets"); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *); extern struct protosw inetsw[]; /* * Kernel module interface for updating icmpstat. The argument is an index * into icmpstat treated as an array of u_long. While this encodes the * general layout of icmpstat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmpstat_inc(int statnum) { counter_u64_add(VNET(icmpstat)[statnum], 1); } /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) { struct ip *oip, *nip; struct icmp *icp; struct mbuf *m; unsigned icmplen, icmpelen, nlen, oiphlen; KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__)); if (type != ICMP_REDIRECT) ICMPSTAT_INC(icps_error); /* * Don't send error: * if the original packet was encrypted. * if not the first fragment of message. * in response to a multicast or broadcast packet. * if the old packet protocol was an ICMP error message. */ if (n->m_flags & M_DECRYPTED) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; /* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */ if (n->m_len < sizeof(struct ip) + ICMP_MINLEN) goto freeit; oip = mtod(n, struct ip *); oiphlen = oip->ip_hl << 2; if (n->m_len < oiphlen + ICMP_MINLEN) goto freeit; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (oip->ip_off & htons(~(IP_MF|IP_DF))) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { ICMPSTAT_INC(icps_oldicmp); goto freeit; } /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. * Unfortunately this is non-trivial since ip_forward() * sends us truncated packets. */ nlen = m_length(n, NULL); if (oip->ip_p == IPPROTO_TCP) { struct tcphdr *th; int tcphlen; if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct tcphdr) && (n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL) goto freeit; oip = mtod(n, struct ip *); th = mtodo(n, oiphlen); tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (ntohs(oip->ip_len) < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && (n = m_pullup(n, oiphlen + tcphlen)) == NULL) goto freeit; icmpelen = max(tcphlen, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } else if (oip->ip_p == IPPROTO_SCTP) { struct sctphdr *sh; struct sctp_chunkhdr *ch; if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr)) goto stdreply; if (oiphlen + sizeof(struct sctphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct sctphdr) && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL) goto freeit; oip = mtod(n, struct ip *); icmpelen = max(sizeof(struct sctphdr), min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); sh = mtodo(n, oiphlen); if (ntohl(sh->v_tag) == 0 && ntohs(oip->ip_len) >= oiphlen + sizeof(struct sctphdr) + 8 && (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 || n->m_next != NULL)) { if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr) + 8)) == NULL) goto freeit; oip = mtod(n, struct ip *); sh = mtodo(n, oiphlen); ch = (struct sctp_chunkhdr *)(sh + 1); if (ch->chunk_type == SCTP_INITIATION) { icmpelen = max(sizeof(struct sctphdr) + 8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } } } else stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) m = m_gethdr(M_NOWAIT, MT_DATA); else m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto freeit; #ifdef MAC mac_netinet_icmp_reply(n, m); #endif icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); m_align(m, ICMP_MINLEN + icmplen); m->m_len = ICMP_MINLEN + icmplen; /* XXX MRT make the outgoing packet use the same FIB * that was associated with the incoming packet */ M_SETFIB(m, M_GETFIB(n)); icp = mtod(m, struct icmp *); ICMPSTAT_INC(icps_outhist[type]); icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * just zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && mtu) { icp->icmp_nextmtu = htons(mtu); } } icp->icmp_code = code; /* * Copy the quotation into ICMP message and * convert quoted IP header back to network representation. */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Set up ICMP message mbuf and copy old IP header (without options * in front of ICMP message. * If the original mbuf was meant to bypass the firewall, the error * reply should bypass as well. */ m->m_flags |= n->m_flags & M_SKIP_FIREWALL; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = htons(m->m_len); nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; nip->ip_off = 0; icmp_reflect(m); freeit: m_freem(n); } /* * Process a received ICMP message. */ int icmp_input(struct mbuf **mp, int *offp, int proto) { struct icmp *icp; struct in_ifaddr *ia; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; int hlen = *offp; int icmplen = ntohs(ip->ip_len) - *offp; int i, code; void (*ctlfunc)(int, struct sockaddr *, void *); int fibnum; *mp = NULL; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char srcbuf[INET_ADDRSTRLEN]; char dstbuf[INET_ADDRSTRLEN]; printf("icmp_input from %s to %s, len %d\n", inet_ntoa_r(ip->ip_src, srcbuf), inet_ntoa_r(ip->ip_dst, dstbuf), icmplen); } #endif + NET_EPOCH_ENTER(); if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); + NET_EPOCH_EXIT(); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { ICMPSTAT_INC(icps_checksum); goto freeit; } m->m_len += hlen; m->m_data -= hlen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. */ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; /* Initialize */ bzero(&icmpsrc, sizeof(icmpsrc)); icmpsrc.sin_len = sizeof(struct sockaddr_in); icmpsrc.sin_family = AF_INET; bzero(&icmpdst, sizeof(icmpdst)); icmpdst.sin_len = sizeof(struct sockaddr_in); icmpdst.sin_family = AF_INET; bzero(&icmpgw, sizeof(icmpgw)); icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; ICMPSTAT_INC(icps_inhist[icp->icmp_type]); code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. * Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: code = PRC_UNREACH_PROTOCOL; break; case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); goto freeit; } /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp)); ip_stripoptions(m); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { /* This should actually not happen */ ICMPSTAT_INC(icps_tooshort); + NET_EPOCH_EXIT(); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); icp = (struct icmp *)(ip + 1); /* * The upper layer handler can rely on: * - The outer IP header has no options. * - The outer IP header, the ICMP header, the inner IP header, * and the first n bytes of the inner payload are contiguous. * n is at least 8, but might be larger based on * ICMP_ADVLENPREF. See its definition in ip_icmp.h. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: ICMPSTAT_INC(icps_badcode); break; case ICMP_ECHO: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcastecho); break; } if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; icp->icmp_type = ICMP_ECHOREPLY; goto reflect; case ICMP_TSTAMP: if (V_icmptstamprepl == 0) break; if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcasttstamp); break; } if (icmplen < ICMP_TSLEN) { ICMPSTAT_INC(icps_badlen); break; } if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. */ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) break; - if (ia->ia_ifp == NULL) { - ifa_free(&ia->ia_ifa); + if (ia->ia_ifp == NULL) break; - } icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; else icp->icmp_mask = V_icmpmaskfake; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } - ifa_free(&ia->ia_ifa); reflect: ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); + NET_EPOCH_EXIT(); return (IPPROTO_DONE); case ICMP_REDIRECT: if (V_log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } /* * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ if (V_drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char dstbuf[INET_ADDRSTRLEN]; char gwbuf[INET_ADDRSTRLEN]; printf("redirect dst %s to %s\n", inet_ntoa_r(icp->icmp_ip.ip_dst, dstbuf), inet_ntoa_r(icp->icmp_gwaddr, gwbuf)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { in_rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, fibnum); } pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); break; /* * No kernel processing for the following; * just fall through to send to raw listener. */ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: case ICMP_SOURCEQUENCH: default: break; } raw: + NET_EPOCH_EXIT(); *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: + NET_EPOCH_EXIT(); m_freem(m); return (IPPROTO_DONE); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; struct in_ifaddr *ia; struct in_addr t; struct nhop4_extended nh_ext; struct mbuf *opts = NULL; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) || IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) { m_freem(m); /* Bad return address */ ICMPSTAT_INC(icps_badaddr); goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; /* * Source selection for ICMP replies: * * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { t = IA_SIN(ia)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * If the incoming packet was addressed to one of our broadcast * addresses, use the first non-broadcast address which corresponds * to the incoming interface. */ ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) { t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface the packet came through in. If that interface * doesn't have a suitable IP address, the normal selection * criteria apply. */ if (V_icmp_rfi && ifp != NULL) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the incoming packet was not addressed directly to us, use * designated interface for icmp replies specified by sysctl * net.inet.icmp.reply_src (default not set). Otherwise continue * with normal source selection. */ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface that is the closest to the packet source. * When we don't have a route back to the packet source, stop here * and drop the packet. */ if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh_ext) != 0) { m_freem(m); ICMPSTAT_INC(icps_noroute); goto done; } t = nh_ext.nh_src; match: #ifdef MAC mac_netinet_icmp_replyinplace(m); #endif ip->ip_src = t; ip->ip_ttl = V_ip_defttl; if (optlen > 0) { u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute(m)) == NULL && (opts = m_gethdr(M_NOWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } ip_stripoptions(m); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts); done: if (opts) (void)m_free(opts); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(struct mbuf *m, struct mbuf *opts) { struct ip *ip = mtod(m, struct ip *); int hlen; struct icmp *icp; hlen = ip->ip_hl << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char dstbuf[INET_ADDRSTRLEN]; char srcbuf[INET_ADDRSTRLEN]; printf("icmp_send dst %s src %s\n", inet_ntoa_r(ip->ip_dst, dstbuf), inet_ntoa_r(ip->ip_src, srcbuf)); } #endif (void) ip_output(m, opts, NULL, 0, NULL, NULL); } /* * Return milliseconds since 00:00 UTC in network format. */ uint32_t iptime(void) { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. */ int ip_next_mtu(int mtu, int dir) { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, 296, 68, 0 }; int i, size; size = (sizeof mtutab) / (sizeof mtutab[0]); if (dir >= 0) { for (i = 0; i < size; i++) if (mtu > mtutab[i]) return mtutab[i]; } else { for (i = size - 1; i >= 0; i--) if (mtu < mtutab[i]) return mtutab[i]; if (mtu == mtutab[0]) return mtutab[0]; } return 0; } #endif /* INET */ /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ struct icmp_rate { const char *descr; struct counter_rate cr; }; static VNET_DEFINE(struct icmp_rate, icmp_rates[BANDLIM_MAX]) = { { "icmp unreach response" }, { "icmp ping response" }, { "icmp tstamp response" }, { "closed port RST response" }, { "open port RST response" }, { "icmp6 unreach response" }, { "sctp ootb response" } }; #define V_icmp_rates VNET(icmp_rates) static void icmp_bandlimit_init(void) { for (int i = 0; i < BANDLIM_MAX; i++) { V_icmp_rates[i].cr.cr_rate = counter_u64_alloc(M_WAITOK); V_icmp_rates[i].cr.cr_ticks = ticks; } } VNET_SYSINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, icmp_bandlimit_init, NULL); static void icmp_bandlimit_uninit(void) { for (int i = 0; i < BANDLIM_MAX; i++) counter_u64_free(V_icmp_rates[i].cr.cr_rate); } VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp_bandlimit_uninit, NULL); int badport_bandlim(int which) { int64_t pps; if (V_icmplim == 0 || which == BANDLIM_UNLIMITED) return (0); KASSERT(which >= 0 && which < BANDLIM_MAX, ("%s: which %d", __func__, which)); pps = counter_ratecheck(&V_icmp_rates[which].cr, V_icmplim); if (pps == -1) return (-1); if (pps > 0 && V_icmplim_output) log(LOG_NOTICE, "Limiting %s from %jd to %d packets/sec\n", V_icmp_rates[which].descr, (intmax_t )pps, V_icmplim); return (0); } Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 334117) +++ head/sys/netinet/ip_input.c (revision 334118) @@ -1,1433 +1,1427 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif struct rmlock in_ifaddr_lock; RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; CK_STAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); /* Skip initialization of globals for non-default instances. */ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip_nh); #ifdef RSS netisr_register_vnet(&ip_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE static void ip_destroy(void *unused __unused) { struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip_direct_nh); #endif netisr_unregister_vnet(&ip_nh); if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } /* Remove the IPv4 addresses from all interfaces. */ in_ifscrub_all(); /* Make sure the IPv4 routes are gone as well. */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) rt_flushifroutes_af(ifp, AF_INET); IFNET_RUNLOCK(); /* Destroy IP reassembly queue. */ ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Try to forward the packet, but if we fail continue. * ip_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip pointer. */ if (V_ipforwarding != 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv4) || IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ip = mtod(m, struct ip *); goto ours; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, 0, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if ( #ifdef IPSTEALTH V_ipstealth == 0 && #endif ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; #ifdef RADIX_MPATH rtalloc_mpath_fib(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), M_GETFIB(m)); #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif + NET_EPOCH_ENTER(); if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); - ifa_ref(&ia->ia_ifa); } else ia = NULL; /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copym() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (V_ipstealth == 0) #endif ip->ip_ttl -= IPTTLDEC; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_FORWARD(ipv4, m)) != 0) { /* mbuf consumed by IPsec */ m_freem(mcopy); if (error != EINPROGRESS) IPSTAT_INC(ips_cantforward); - return; + goto out; } /* No IPsec processing required */ } #endif /* IPSEC */ /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct rtentry *rt; rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return; + goto out; } } - if (mcopy == NULL) { - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return; - } + if (mcopy == NULL) + goto out; + switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return; + goto out; } - if (ia != NULL) - ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); + out: + NET_EPOCH_EXIT(); } #define CHECK_SO_CT(sp, ct) \ (((sp->so_options & SO_TIMESTAMP) && (sp->so_ts_clock == ct)) ? 1 : 0) void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { bool stamped; stamped = false; if ((inp->inp_socket->so_options & SO_BINTIME) || CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) { struct bintime boottimebin, bt; struct timespec ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt); getboottimebin(&boottimebin); bintime_add(&bt, &boottimebin); } else { bintime(&bt); } *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) { struct bintime boottimebin, bt1; struct timespec ts1;; struct timeval tv; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &tv); } else { microtime(&tv); } *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) { struct bintime boottimebin; struct timespec ts, ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); timespecadd(&ts, &ts1); } else { nanotime(&ts); } *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_REALTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) { struct timespec ts; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &ts); else nanouptime(&ts); *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_MONOTONIC, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET); if (*mp != NULL) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ static VNET_DEFINE(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 334117) +++ head/sys/netinet/ip_mroute.c (revision 334118) @@ -1,2952 +1,2954 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling */ /* * TODO: Prefix functions with ipmf_. * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol * domain attachment (if_afdata) so we can track consumers of that service. * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, * move it to socket options. * TODO: Cleanup LSRR removal further. * TODO: Push RSVP stubs into raw_ip.c. * TODO: Use bitstring.h for vif set. * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. * TODO: Sync ip6_mroute.c with this file. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif #define VIFI_INVALID ((vifi_t) -1) static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. Note that each lock is used * to cover not only the specific data structure but also related data * structures. */ static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) #define MFC_LOCK_INIT() \ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static VNET_DEFINE(vifi_t, numvifs); #define V_numvifs VNET(numvifs) static VNET_DEFINE(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() \ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static eventhandler_tag if_detach_event_tag = NULL; static VNET_DEFINE(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. */ #define BW_METER_BUCKETS 1024 static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) static VNET_DEFINE(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) static VNET_DEFINE(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); extern struct domain inetdomain; static const struct protosw in_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; #define PIM_ENCAP_TTL 64 static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ PIM_ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) static VNET_DEFINE(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* * Private variables. */ static u_long X_ip_mcast_src(int); static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *, struct sockopt *); static int X_ip_mrouter_set(struct socket *, struct sockopt *); static int X_legal_vif_num(int); static int X_mrt_ioctl(u_long, caddr_t, int); static int add_bw_upcall(struct bw_upcall *); static int add_mfc(struct mfcctl2 *); static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); static void bw_meter_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); static void free_bw_list(struct bw_meter *); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *, struct ifnet *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static int ip_mrouter_init(struct socket *, int); static __inline struct mfc * mfc_find(struct in_addr *, struct in_addr *); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static struct mbuf * pim_register_prepare(struct ip *, struct mbuf *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static void schedule_bw_meter(struct bw_meter *, struct timeval *); static void send_packet(struct vif *, struct mbuf *); static int set_api_config(uint32_t *); static int set_assert(int); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static void unschedule_bw_meter(struct bw_meter *); /* * Kernel multicast forwarding API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ #define MRT_API_VERSION 0x0305 static const int mrt_api_version = MRT_API_VERSION; static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static VNET_DEFINE(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) static VNET_DEFINE(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ /* * Find a route for a given origin IP address and multicast group address. * Statistics must be updated by the caller. */ static __inline struct mfc * mfc_find(struct in_addr *o, struct in_addr *g) { struct mfc *rt; MFC_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && TAILQ_EMPTY(&rt->mfc_stall)) break; } return (rt); } /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. */ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && V_mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &V_pim_assert_enabled, sizeof V_pim_assert_enabled); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl_fib(). * Typically, only root can create the raw socket in order to execute * this ioctl method, however the request might be coming from a prison */ error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(&req->src, &req->grp); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= V_numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = V_viftable[vifi].v_pkt_in; req->ocount = V_viftable[vifi].v_pkt_out; req->ibytes = V_viftable[vifi].v_bytes_in; req->obytes = V_viftable[vifi].v_bytes_out; VIF_UNLOCK(); return 0; } static void if_detached_event(void *arg __unused, struct ifnet *ifp) { vifi_t vifi; u_long i; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return; } VIF_LOCK(); MFC_LOCK(); /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. * 2. Walk the multicast forwarding cache (mfc) looking for * inner matches with this vif's index. * 3. Expire any matching multicast forwarding cache entries. * 4. Free vif state. This should disable ALLMULTI on the interface. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (V_viftable[vifi].v_ifp != ifp) continue; for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (rt->mfc_parent == vifi) { expire_mfc(rt); } } } del_vif_locked(vifi); } MFC_UNLOCK(); VIF_UNLOCK(); MROUTER_UNLOCK(); } /* * Enable multicast forwarding. */ static int ip_mrouter_init(struct socket *so, int version) { CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; MROUTER_LOCK(); if (ip_mrouter_unloading) { MROUTER_UNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, HASH_NOWAIT); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); V_ip_mrouter = so; ip_mrouter_cnt++; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Disable multicast forwarding. */ static int X_ip_mrouter_done(void) { struct ifnet *ifp; u_long i; vifi_t vifi; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } /* * Detach/disable hooks to the reset of the system. */ V_ip_mrouter = NULL; ip_mrouter_cnt--; V_mrt_api_config = 0; VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ifp = V_viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)V_viftable, sizeof(V_viftable)); V_numvifs = 0; V_pim_assert_enabled = 0; VIF_UNLOCK(); callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); callout_stop(&V_bw_meter_ch); MFC_LOCK(); /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } free(V_mfchashtbl, M_MRTABLE); V_mfchashtbl = NULL; bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); MFC_UNLOCK(); V_reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; V_pim_assert_enabled = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { u_long i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (V_numvifs > 0) { *apival = 0; return EPERM; } if (V_pim_assert_enabled) { *apival = 0; return EPERM; } MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { MFC_UNLOCK(); *apival = 0; return EPERM; } } MFC_UNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = V_viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { VIF_UNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. */ ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; + NET_EPOCH_ENTER(); ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { + NET_EPOCH_EXIT(); VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; - ifa_free(ifa); + NET_EPOCH_EXIT(); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); if (V_reg_vif_num == VIFI_INVALID) { if_initname(&V_multicast_register_if, "register_vif", 0); V_multicast_register_if.if_flags = IFF_LOOPBACK; V_reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__, (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr), (int)vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; } vifp = &V_viftable[vifi]; if (in_nullhost(vifp->v_lcl_addr)) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); /* Adjust numvifs down */ for (vifi = V_numvifs; vifi > 0; vifi--) if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) break; V_numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (V_mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. */ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); } static void expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; MFC_LOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; struct rtdetq *rte, *nrte; u_long hash = 0; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ if (rt) { CTR4(KTR_IPMF, "%s: update mfc orig 0x%08x group %lx parent %x", __func__, ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ nstl = 0; hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) { CTR5(KTR_IPMF, "%s: add mfc orig 0x%08x group %lx parent %x qh %p", __func__, ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, TAILQ_FIRST(&rt->mfc_stall)); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. */ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall--; free(rte, M_MRTABLE); } } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) V_nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); } } MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__, ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } /* * free the bw_meter entries */ free_bw_list(rt->mfc_bw_meter); rt->mfc_bw_meter = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); MFC_UNLOCK(); return (0); } /* * Send a message to the routing daemon on the multicast routing socket. */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. */ return (1); } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ MRTSTAT_INC(mrts_mfc_lookups); rt = mfc_find(&ip->ip_src, &ip->ip_dst); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); CTR2(KTR_IPMF, "ip_mforward: no mfc for (0x%08x,%lx)", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? */ hash = MFCHASH(ip->ip_src, ip->ip_dst); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copym(mb0, 0, hlen, M_NOWAIT); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; V_nexpire[hash]++; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; rt->mfc_bw_meter = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; /* link into table */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } else { /* determine if queue has overflowed */ if (rt->mfc_nstall > MAX_UPQ) { MRTSTAT_INC(mrts_upq_ovflw); non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } rte->m = mb0; rte->ifp = ifp; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *arg) { u_long i; CURVNET_SET((struct vnet *) arg); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; if (V_nexpire[i] == 0) continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; /* * free the bw_meter entries */ while (rt->mfc_bw_meter != NULL) { struct bw_meter *x = rt->mfc_bw_meter; rt->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); expire_mfc(rt); } } MFC_UNLOCK(); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); CURVNET_RESTORE(); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < V_numvifs) { if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + xmt_vif, m, rt); else phyint_send(ip, V_viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); MRTSTAT_INC(mrts_wrong_if); ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (V_pim_assert_enabled && (vifi < V_numvifs) && V_viftable[vifi].v_ifp) { if (ifp == &V_multicast_register_if) PIMSTAT_INC(pims_rcv_registers_wrongiif); /* Get vifi for the incoming packet */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT); if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = im->im_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; } else { V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < V_numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; if (V_viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + vifi, m, rt); else phyint_send(ip, V_viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * Check if a vif number is legal/ok. This is used by in_mcast.c. */ static int X_legal_vif_num(int vif) { int ret; ret = 0; if (vif < 0) return (ret); VIF_LOCK(); if (vif < V_numvifs) ret = 1; VIF_UNLOCK(); return (ret); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { in_addr_t addr; addr = INADDR_ANY; if (vifi < 0) return (addr); VIF_LOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; VIF_UNLOCK(); return (addr); } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; int error __unused; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; imo.imo_num_memberships = 0; imo.imo_max_memberships = 2; imo.imo_membership = &imm[0]; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } /* * Stubs for old RSVP socket shim implementation. */ static int X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) { return (EOPNOTSUPP); } static void X_ip_rsvp_force_done(struct socket *so __unused) { } static int X_rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (!V_rsvp_on) m_freem(m); return (IPPROTO_DONE); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; microtime(&now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (V_bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &V_bw_upcalls[V_bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (V_bw_upcalls_n == 0) return; /* No pending upcalls */ V_bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); MRTSTAT_INC(mrts_upq_sockfull); } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. */ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else V_bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { uint32_t loops; int i; struct timeval now, process_endtime; microtime(&now); if (V_last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - V_last_tv_sec; V_last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. */ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = V_bw_meter_timers[i]; V_bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *arg) { CURVNET_SET((struct vnet *) arg); MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); CURVNET_RESTORE(); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *arg) { CURVNET_SET((struct vnet *) arg); if (V_mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); CURVNET_RESTORE(); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && in_nullhost(rt->mfc_rp)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ mb_copy->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - V_viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. */ ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); ip_fillid(ip_outer); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); send_packet(vifp, mb_first); /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * pim_encapcheck() is called by the encap4_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. */ static int pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { #ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); #endif if (proto != IPPROTO_PIM) return 0; /* not for us; reject the datagram. */ return 64; /* claim the datagram. */ } /* * PIM-SMv2 and PIM-DM messages processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ int pim_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; int iphlen = *offp; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); PIMSTAT_ADD(pims_rcv_total_bytes, datalen); /* * Validate lengths */ if (datalen < PIM_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); CTR3(KTR_IPMF, "%s: short packet (%d) from 0x%08x", __func__, datalen, ntohl(ip->ip_src.s_addr)); m_freem(m); return (IPPROTO_DONE); } /* * If the packet is at least as big as a REGISTER, go agead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reason perform the checksum over part of the * message, and if error, then over the whole message. */ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); return (IPPROTO_DONE); } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { PIMSTAT_INC(pims_rcv_badversion); CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. */ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); return (IPPROTO_DONE); } /* XXX need refcnt? */ vifp = V_viftable[V_reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); CTR3(KTR_IPMF, "%s: register: encap ip src 0x%08x len %d", __func__, ntohl(encap_ip->ip_src.s_addr), ntohs(encap_ip->ip_len)); /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { PIMSTAT_INC(pims_rcv_badregisters); CTR2(KTR_IPMF, "%s: bad encap ip dest 0x%08x", __func__, ntohl(encap_ip->ip_dst.s_addr)); m_freem(m); return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT); if (mcp == NULL) { CTR1(KTR_IPMF, "%s: m_copym() failed", __func__); m_freem(m); return (IPPROTO_DONE); } /* Keep statistics */ /* XXX: registers_bytes include only the encap. mcast pkt */ PIMSTAT_INC(pims_rcv_registers_msgs); PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); CTR4(KTR_IPMF, "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", __func__, (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), (int)V_reg_vif_num); /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjust to * reflect the fact that the inner multicast data is truncated. */ *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } static int sysctl_mfctable(SYSCTL_HANDLER_ARGS) { struct mfc *rt; int error, i; if (req->newptr) return (EPERM); if (V_mfchashtbl == NULL) /* XXX unlocked */ return (0); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); if (error) goto out_locked; } } out_locked: MFC_UNLOCK(); return (error); } static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, "IPv4 Multicast Forwarding Table " "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) { V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); callout_init(&V_bw_meter_ch, 1); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, NULL); static void vnet_mroute_uninit(const void *unused __unused) { free(V_nexpire, M_MRTABLE); V_nexpire = NULL; } VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); MROUTER_LOCK_DESTROY(); return (EINVAL); } MFC_LOCK_INIT(); VIF_LOCK_INIT(); mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { printf("WARNING: %s not a power of 2; using default\n", "net.inet.ip.mfchashsize"); mfchashsize = MFCHASHSIZE; } pim_squelch_wholepkt = 0; TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, pim_encapcheck, &in_pim_protosw, NULL); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); return (EINVAL); } ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shutdown the kernel services (the check * below insures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. */ MROUTER_LOCK(); if (ip_mrouter_cnt != 0) { MROUTER_UNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; MROUTER_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: head/sys/netinet/ip_options.c =================================================================== --- head/sys/netinet/ip_options.c (revision 334117) +++ head/sys/netinet/ip_options.c (revision 334118) @@ -1,762 +1,763 @@ /* * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static VNET_DEFINE(int, ip_dosourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0, "Enable forwarding source routed IP packets"); #define V_ip_dosourceroute VNET(ip_dosourceroute) static VNET_DEFINE(int, ip_acceptsourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0, "Enable accepting source routed IP packets"); #define V_ip_acceptsourceroute VNET(ip_acceptsourceroute) VNET_DEFINE(int, ip_doopts) = 1; /* 0 = ignore, 1 = process, 2 = reject */ SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_doopts), 0, "Enable IP options processing ([LS]SRR, RR, TS)"); static void save_rte(struct mbuf *m, u_char *, struct in_addr); /* * Do option processing on a datagram, possibly discarding it if bad options * are encountered, or forwarding it if source-routed. * * The pass argument is used when operating in the IPSTEALTH mode to tell * what options to process: [LS]SRR (pass 0) or the others (pass 1). The * reason for as many as two passes is that when doing IPSTEALTH, non-routing * options should be processed only if the packet is for us. * * Returns 1 if packet has been forwarded/freed, 0 if the packet should be * processed further. */ int ip_dooptions(struct mbuf *m, int pass) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; uint32_t ntime; struct nhop4_extended nh_ext; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; + NET_EPOCH_ENTER(); /* Ignore or reject packets with IP options. */ if (V_ip_doopts == 0) return 0; else if (V_ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; goto bad; } dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. Find interface with current * destination address. If none on this machine then drop if * strictly routed, or do nothing if loosely routed. Record * interface address and bring up next address component. If * strictly routed make sure next address is on directly * accessible net. */ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (V_ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr) == 0) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!V_ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. */ if (!V_ip_acceptsourceroute) goto nosourcerouting; save_rte(m, cp, ip->ip_src); break; } #ifdef IPSTEALTH if (V_ipstealth) goto dropit; #endif if (!V_ip_dosourceroute) { if (V_ipforwarding) { char srcbuf[INET_ADDRSTRLEN]; char dstbuf[INET_ADDRSTRLEN]; /* * Acting as a router, so generate * ICMP */ nosourcerouting: log(LOG_WARNING, "attempted source route from %s " "to %s\n", inet_ntoa_r(ip->ip_src, srcbuf), inet_ntoa_r(ip->ip_dst, dstbuf)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so * silently drop. */ #ifdef IPSTEALTH dropit: #endif IPSTAT_INC(ips_cantforward); m_freem(m); + NET_EPOCH_EXIT(); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr, RT_ALL_FIBS); if (ia == NULL) ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0, RT_ALL_FIBS); if (ia == NULL) goto bad; memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); } else { /* XXX MRT 0 for routing */ if (fib4_lookup_nh_ext(M_GETFIB(m), ipaddr.sin_addr, 0, 0, &nh_ext) != 0) goto bad; memcpy(cp + off, &nh_ext.nh_src, sizeof(struct in_addr)); } ip->ip_dst = ipaddr.sin_addr; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * Locate outgoing interface; if we're the * destination, use the incoming interface (should be * same). */ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) { memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); } else if (fib4_lookup_nh_ext(M_GETFIB(m), ipaddr.sin_addr, 0, 0, &nh_ext) == 0) { memcpy(cp + off, &nh_ext.nh_src, sizeof(struct in_addr)); } else { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == NULL) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr_check((SA)&ipaddr) == 0) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(uint32_t)); cp[IPOPT_OFFSET] += sizeof(uint32_t); } } + NET_EPOCH_EXIT(); if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: + NET_EPOCH_EXIT(); icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); return (1); } /* * Save incoming source route for use in replies, to be picked up later by * ip_srcroute if the receiver is interested. */ static void save_rte(struct mbuf *m, u_char *option, struct in_addr dst) { unsigned olen; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS, sizeof(struct ipopt_tag), M_NOWAIT); if (opts == NULL) return; olen = option[IPOPT_OLEN]; if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) { m_tag_free((struct m_tag *)opts); return; } bcopy(option, opts->ip_srcrt.srcopt, olen); opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); opts->ip_srcrt.dst = dst; m_tag_prepend(m, (struct m_tag *)opts); } /* * Retrieve incoming source route for use in replies, in the same form used * by setsockopt. The first hop is placed before the options, will be * removed later. */ struct mbuf * ip_srcroute(struct mbuf *m0) { struct in_addr *p, *q; struct mbuf *m; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL); if (opts == NULL) return (NULL); if (opts->ip_nhops == 0) return (NULL); m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = opts->ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; /* * First, save first hop for return route. */ p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]); *(mtod(m, struct in_addr *)) = *p--; /* * Copy option fields and padding (nop) to mbuf. */ opts->ip_srcrt.nop = IPOPT_NOP; opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &(opts->ip_srcrt.nop), OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, reversing the path * (pointers are now aligned). */ while (p >= opts->ip_srcrt.route) { *q++ = *p--; } /* * Last hop goes to final destination. */ *q = opts->ip_srcrt.dst; m_tag_delete(m0, (struct m_tag *)opts); return (m); } /* * Strip out IP options, at higher level protocol in the kernel. */ void ip_stripoptions(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); int olen; olen = (ip->ip_hl << 2) - sizeof(struct ip); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_len = htons(ntohs(ip->ip_len) - olen); ip->ip_hl = sizeof(struct ip) >> 2; bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1), (size_t )(m->m_len - sizeof(struct ip))); } /* * Insert IP options into preformed packet. Adjust IP destination as * required for IP source routing, as indicated by a non-zero in_addr at the * start of the options. * * XXX This routine assumes that the packet has no options in place. */ struct mbuf * ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (!M_WRITABLE(m) || M_LEADINGSPACE(m) < optlen) { n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { *phlen = 0; return (m); } m_move_pkthdr(n, m); n->m_pkthdr.rcvif = NULL; n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; ip->ip_len = htons(ntohs(ip->ip_len) + optlen); return (m); } /* * Copy options from ip to jp, omitting those not copied during * fragmentation. */ int ip_optcopy(struct ip *ip, struct ip *jp) { u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. */ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* Bogus lengths should have been caught by ip_dooptions. */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * Set up IP options in pcb for insertion in output packets. Store in mbuf * with pointer in pcbopt, adding pseudo-option with destination address if * source routed. */ int ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) { int cnt, optlen; u_char *cp; struct mbuf **pcbopt; u_char opt; INP_WLOCK_ASSERT(inp); pcbopt = &inp->inp_options; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = NULL; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. */ if (m != NULL) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before actual * options; move other options back and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); bcopy(mtod(m, void *), cp, (unsigned)cnt); bzero(mtod(m, void *), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * User process specifies route as: * * ->A->B->C->D * * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear * in actual IP option, but is stored before the * options. */ /* XXX-BZ PRIV_NETINET_SETHDROPTS? */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), &cp[IPOPT_OFFSET+1], (unsigned)cnt - (IPOPT_MINOFF - 1)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * Check for the presence of the IP Router Alert option [RFC2113] * in the header of an IPv4 datagram. * * This call is not intended for use from the forwarding path; it is here * so that protocol domains may check for the presence of the option. * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert * option does not have much relevance to the implementation, though this * may change in future. * Router alert options SHOULD be passed if running in IPSTEALTH mode and * we are not the endpoint. * Length checks on individual options should already have been performed * by ip_dooptions() therefore they are folded under INVARIANTS here. * * Return zero if not present or options are invalid, non-zero if present. */ int ip_checkrouteralert(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); u_char *cp; int opt, optlen, cnt, found_ra; found_ra = 0; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { #ifdef INVARIANTS if (cnt < IPOPT_OLEN + sizeof(*cp)) break; #endif optlen = cp[IPOPT_OLEN]; #ifdef INVARIANTS if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) break; #endif } switch (opt) { case IPOPT_RA: #ifdef INVARIANTS if (optlen != IPOPT_OFFSET + sizeof(uint16_t) || (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0)) break; else #endif found_ra = 1; break; default: break; } } return (found_ra); } Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 334117) +++ head/sys/netinet/ip_output.c (revision 334118) @@ -1,1435 +1,1427 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #include #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; static inline int ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, 0, inp); m = *mp; if ((*error) != 0 || m == NULL) return 1; /* Finished */ ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; return -1; /* Reloop */ } /* See if fib was changed by packet filter. */ if ((*fibnum) != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; *fibnum = M_GETFIB(m); return -1; /* Reloop for FIB change */ } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); return -1; /* Reloop for CHANGE of dst */ } return 0; } /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct rm_priotracker in_ifa_tracker; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst; const struct sockaddr_in *gw; struct in_ifaddr *ia; int isbroadcast; uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ uint32_t fibnum; - int have_ia_ref; #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } /* * dst/gw handling: * * dst can be rewritten but always points to &ro->ro_dst. * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ gw = dst = (struct sockaddr_in *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); rte = ro->ro_rt; if (rte == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } + NET_EPOCH_ENTER(); again: /* * Validate route against routing table additions; * a better/more specific route might have been added. */ if (inp) RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. * Also check whether routing cache needs invalidation. */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RO_INVALIDATE_CACHE(ro); rte = NULL; } ia = NULL; - have_ia_ref = 0; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } - have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } - have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ? in_ifaddr_broadcast(dst->sin_addr, ia) : 0; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia) - have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (rte == NULL) { #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), fibnum); #else in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; } if (rte == NULL || (rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ia = ifatoia(rte->rt_ifa); ifp = rte->rt_ifp; counter_u64_add(rte->rt_pksent, 1); rt_update_ro_flags(ro); if (rte->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); else if (ifp->if_flags & IFF_BROADCAST) isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); else isbroadcast = 0; } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) mtu = rte->rt_mtu; else mtu = ifp->if_mtu; /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (PFIL_HOOKED(&V_inet_pfil_hook)) { switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) { case 1: /* Finished */ goto done; case 0: /* Continue normally */ ip = mtod(m, struct ip *); break; case -1: /* Need to try again */ /* Reset everything for a new round */ RO_RTFREE(ro); - if (have_ia_ref) - ifa_free(&ia->ia_ifa); ro->ro_prepend = NULL; rte = NULL; gw = dst; ip = mtod(m, struct ip *); goto again; } } /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); #ifdef RATELIMIT if (inp != NULL) { if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) in_pcboutput_txrtlmt(inp, ifp, m); /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = inp->inp_snd_tag; } else { m->m_pkthdr.snd_tag = NULL; } #endif error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); #ifdef RATELIMIT /* check for route change */ if (error == EAGAIN) in_pcboutput_eagain(inp); #endif goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp, mtod(m, struct ip *), NULL); #ifdef RATELIMIT if (inp != NULL) { if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) in_pcboutput_txrtlmt(inp, ifp, m); /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = inp->inp_snd_tag; } else { m->m_pkthdr.snd_tag = NULL; } #endif error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); #ifdef RATELIMIT /* check for route change */ if (error == EAGAIN) in_pcboutput_eagain(inp); #endif } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: if (ro == &iproute) RO_RTFREE(ro); else if (rte == NULL) /* * If the caller supplied a route but somehow the reference * to it has been released need to prevent the caller * calling RTFREE on it again. */ ro->ro_rt = NULL; - if (have_ia_ref) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); return (error); -bad: + bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; ip_len = ntohs(ip->ip_len); csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ /* find the mbuf in the chain where the checksum starts*/ while ((m != NULL) && (offset >= m->m_len)) { offset -= m->m_len; m = m->m_next; } KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain.")); KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs.")); *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); if_simloop(ifp, copym, AF_INET, 0); } } Index: head/sys/netinet/netdump/netdump_client.c =================================================================== --- head/sys/netinet/netdump/netdump_client.c (revision 334117) +++ head/sys/netinet/netdump/netdump_client.c (revision 334118) @@ -1,1308 +1,1308 @@ /*- * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved. * Copyright (c) 2000 Darrell Anderson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * netdump_client.c * FreeBSD subsystem supporting netdump network dumps. * A dedicated server must be running to accept client dumps. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NETDDEBUG(f, ...) do { \ if (nd_debug > 0) \ printf(("%s: " f), __func__, ## __VA_ARGS__); \ } while (0) #define NETDDEBUG_IF(i, f, ...) do { \ if (nd_debug > 0) \ if_printf((i), ("%s: " f), __func__, ## __VA_ARGS__); \ } while (0) #define NETDDEBUGV(f, ...) do { \ if (nd_debug > 1) \ printf(("%s: " f), __func__, ## __VA_ARGS__); \ } while (0) #define NETDDEBUGV_IF(i, f, ...) do { \ if (nd_debug > 1) \ if_printf((i), ("%s: " f), __func__, ## __VA_ARGS__); \ } while (0) static int netdump_arp_gw(void); static void netdump_cleanup(void); static int netdump_configure(struct netdump_conf *, struct thread *); static int netdump_dumper(void *priv __unused, void *virtual, vm_offset_t physical __unused, off_t offset, size_t length); static int netdump_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst, u_short etype); static void netdump_handle_arp(struct mbuf **mb); static void netdump_handle_ip(struct mbuf **mb); static int netdump_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr, int flags __unused, struct thread *td); static int netdump_modevent(module_t mod, int type, void *priv); static void netdump_network_poll(void); static void netdump_pkt_in(struct ifnet *ifp, struct mbuf *m); static int netdump_send(uint32_t type, off_t offset, unsigned char *data, uint32_t datalen); static int netdump_send_arp(in_addr_t dst); static int netdump_start(struct dumperinfo *di); static int netdump_udp_output(struct mbuf *m); /* Must be at least as big as the chunks dumpsys() gives us. */ static unsigned char nd_buf[MAXDUMPPGS * PAGE_SIZE]; static uint32_t nd_seqno; static int dump_failed, have_gw_mac; static void (*drv_if_input)(struct ifnet *, struct mbuf *); static int restore_gw_addr; static uint64_t rcvd_acks; CTASSERT(sizeof(rcvd_acks) * NBBY == NETDUMP_MAX_IN_FLIGHT); /* * Times to poll the NIC (0.5ms each poll) before assuming packetloss * occurred (default to 1s). */ static int nd_polls = 2000; /* Times to retransmit lost packets. */ static int nd_retries = 10; /* Number of ARP retries. */ static int nd_arp_retries = 3; /* Configuration parameters. */ static struct netdump_conf nd_conf; #define nd_server nd_conf.ndc_server #define nd_client nd_conf.ndc_client #define nd_gateway nd_conf.ndc_gateway /* General dynamic settings. */ static struct ether_addr nd_gw_mac; static struct ifnet *nd_ifp; static uint16_t nd_server_port = NETDUMP_PORT; FEATURE(netdump, "Netdump client support"); static SYSCTL_NODE(_net, OID_AUTO, netdump, CTLFLAG_RD, NULL, "netdump parameters"); static int nd_debug; SYSCTL_INT(_net_netdump, OID_AUTO, debug, CTLFLAG_RWTUN, &nd_debug, 0, "Debug message verbosity"); static int nd_enabled; SYSCTL_INT(_net_netdump, OID_AUTO, enabled, CTLFLAG_RD, &nd_enabled, 0, "netdump configuration status"); static char nd_path[MAXPATHLEN]; SYSCTL_STRING(_net_netdump, OID_AUTO, path, CTLFLAG_RW, nd_path, sizeof(nd_path), "Server path for output files"); /* * Checks for netdump support on a network interface * * Parameters: * ifp The network interface that is being tested for support * * Returns: * int 1 if the interface is supported, 0 if not */ static bool netdump_supported_nic(struct ifnet *ifp) { return (ifp->if_netdump_methods != NULL); } /*- * Network specific primitives. * Following down the code they are divided ordered as: * - Packet buffer primitives * - Output primitives * - Input primitives * - Polling primitives */ /* * Handles creation of the ethernet header, then places outgoing packets into * the tx buffer for the NIC * * Parameters: * m The mbuf containing the packet to be sent (will be freed by * this function or the NIC driver) * ifp The interface to send on * dst The destination ethernet address (source address will be looked * up using ifp) * etype The ETHERTYPE_* value for the protocol that is being sent * * Returns: * int see errno.h, 0 for success */ static int netdump_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst, u_short etype) { struct ether_header *eh; if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) || (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) { if_printf(ifp, "netdump_ether_output: interface isn't up\n"); m_freem(m); return (ENETDOWN); } /* Fill in the ethernet header. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); if (m == NULL) { printf("%s: out of mbufs\n", __func__); return (ENOBUFS); } eh = mtod(m, struct ether_header *); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN); eh->ether_type = htons(etype); return ((ifp->if_netdump_methods->nd_transmit)(ifp, m)); } /* * Unreliable transmission of an mbuf chain to the netdump server * Note: can't handle fragmentation; fails if the packet is larger than * nd_ifp->if_mtu after adding the UDP/IP headers * * Parameters: * m mbuf chain * * Returns: * int see errno.h, 0 for success */ static int netdump_udp_output(struct mbuf *m) { struct udpiphdr *ui; struct ip *ip; MPASS(nd_ifp != NULL); M_PREPEND(m, sizeof(struct udpiphdr), M_NOWAIT); if (m == NULL) { printf("%s: out of mbufs\n", __func__); return (ENOBUFS); } if (m->m_pkthdr.len > nd_ifp->if_mtu) { printf("netdump_udp_output: Packet is too big: %d > MTU %u\n", m->m_pkthdr.len, nd_ifp->if_mtu); m_freem(m); return (ENOBUFS); } ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); ui->ui_pr = IPPROTO_UDP; ui->ui_len = htons(m->m_pkthdr.len - sizeof(struct ip)); ui->ui_ulen = ui->ui_len; ui->ui_src = nd_client; ui->ui_dst = nd_server; /* Use this src port so that the server can connect() the socket */ ui->ui_sport = htons(NETDUMP_ACKPORT); ui->ui_dport = htons(nd_server_port); ui->ui_sum = 0; if ((ui->ui_sum = in_cksum(m, m->m_pkthdr.len)) == 0) ui->ui_sum = 0xffff; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_tos = 0; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_id = 0; ip->ip_off = htons(IP_DF); ip->ip_ttl = 255; ip->ip_sum = 0; ip->ip_sum = in_cksum(m, sizeof(struct ip)); return (netdump_ether_output(m, nd_ifp, nd_gw_mac, ETHERTYPE_IP)); } /* * Builds and sends a single ARP request to locate the server * * Return value: * 0 on success * errno on error */ static int netdump_send_arp(in_addr_t dst) { struct ether_addr bcast; struct mbuf *m; struct arphdr *ah; int pktlen; MPASS(nd_ifp != NULL); /* Fill-up a broadcast address. */ memset(&bcast, 0xFF, ETHER_ADDR_LEN); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { printf("netdump_send_arp: Out of mbufs\n"); return (ENOBUFS); } pktlen = arphdr_len2(ETHER_ADDR_LEN, sizeof(struct in_addr)); m->m_len = pktlen; m->m_pkthdr.len = pktlen; MH_ALIGN(m, pktlen); ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ETHER); ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ETHER_ADDR_LEN; ah->ar_pln = sizeof(struct in_addr); ah->ar_op = htons(ARPOP_REQUEST); memcpy(ar_sha(ah), IF_LLADDR(nd_ifp), ETHER_ADDR_LEN); ((struct in_addr *)ar_spa(ah))->s_addr = nd_client.s_addr; bzero(ar_tha(ah), ETHER_ADDR_LEN); ((struct in_addr *)ar_tpa(ah))->s_addr = dst; return (netdump_ether_output(m, nd_ifp, bcast, ETHERTYPE_ARP)); } /* * Sends ARP requests to locate the server and waits for a response. * We first try to ARP the server itself, and fall back to the provided * gateway if the server appears to be off-link. * * Return value: * 0 on success * errno on error */ static int netdump_arp_gw(void) { in_addr_t dst; int error, polls, retries; dst = nd_server.s_addr; restart: for (retries = 0; retries < nd_arp_retries && have_gw_mac == 0; retries++) { error = netdump_send_arp(dst); if (error != 0) return (error); for (polls = 0; polls < nd_polls && have_gw_mac == 0; polls++) { netdump_network_poll(); DELAY(500); } if (have_gw_mac == 0) printf("(ARP retry)"); } if (have_gw_mac != 0) return (0); if (dst == nd_server.s_addr && nd_server.s_addr != nd_gateway.s_addr) { printf("Failed to ARP server, trying to reach gateway...\n"); dst = nd_gateway.s_addr; goto restart; } printf("\nARP timed out.\n"); return (ETIMEDOUT); } /* * Dummy free function for netdump clusters. */ static void netdump_mbuf_free(struct mbuf *m __unused) { } /* * Construct and reliably send a netdump packet. May fail from a resource * shortage or extreme number of unacknowledged retransmissions. Wait for * an acknowledgement before returning. Splits packets into chunks small * enough to be sent without fragmentation (looks up the interface MTU) * * Parameters: * type netdump packet type (HERALD, FINISHED, or VMCORE) * offset vmcore data offset (bytes) * data vmcore data * datalen vmcore data size (bytes) * * Returns: * int see errno.h, 0 for success */ static int netdump_send(uint32_t type, off_t offset, unsigned char *data, uint32_t datalen) { struct netdump_msg_hdr *nd_msg_hdr; struct mbuf *m, *m2; uint64_t want_acks; uint32_t i, pktlen, sent_so_far; int retries, polls, error; want_acks = 0; rcvd_acks = 0; retries = 0; MPASS(nd_ifp != NULL); retransmit: /* Chunks can be too big to fit in packets. */ for (i = sent_so_far = 0; sent_so_far < datalen || (i == 0 && datalen == 0); i++) { pktlen = datalen - sent_so_far; /* First bound: the packet structure. */ pktlen = min(pktlen, NETDUMP_DATASIZE); /* Second bound: the interface MTU (assume no IP options). */ pktlen = min(pktlen, nd_ifp->if_mtu - sizeof(struct udpiphdr) - sizeof(struct netdump_msg_hdr)); /* * Check if it is retransmitting and this has been ACKed * already. */ if ((rcvd_acks & (1 << i)) != 0) { sent_so_far += pktlen; continue; } /* * Get and fill a header mbuf, then chain data as an extended * mbuf. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { printf("netdump_send: Out of mbufs\n"); return (ENOBUFS); } m->m_len = sizeof(struct netdump_msg_hdr); m->m_pkthdr.len = sizeof(struct netdump_msg_hdr); MH_ALIGN(m, sizeof(struct netdump_msg_hdr)); nd_msg_hdr = mtod(m, struct netdump_msg_hdr *); nd_msg_hdr->mh_seqno = htonl(nd_seqno + i); nd_msg_hdr->mh_type = htonl(type); nd_msg_hdr->mh_offset = htobe64(offset + sent_so_far); nd_msg_hdr->mh_len = htonl(pktlen); nd_msg_hdr->mh__pad = 0; if (pktlen != 0) { m2 = m_get(M_NOWAIT, MT_DATA); if (m2 == NULL) { m_freem(m); printf("netdump_send: Out of mbufs\n"); return (ENOBUFS); } MEXTADD(m2, data + sent_so_far, pktlen, netdump_mbuf_free, NULL, NULL, 0, EXT_DISPOSABLE); m2->m_len = pktlen; m_cat(m, m2); m->m_pkthdr.len += pktlen; } error = netdump_udp_output(m); if (error != 0) return (error); /* Note that we're waiting for this packet in the bitfield. */ want_acks |= (1 << i); sent_so_far += pktlen; } if (i >= NETDUMP_MAX_IN_FLIGHT) printf("Warning: Sent more than %d packets (%d). " "Acknowledgements will fail unless the size of " "rcvd_acks/want_acks is increased.\n", NETDUMP_MAX_IN_FLIGHT, i); /* * Wait for acks. A *real* window would speed things up considerably. */ polls = 0; while (rcvd_acks != want_acks) { if (polls++ > nd_polls) { if (retries++ > nd_retries) return (ETIMEDOUT); printf(". "); goto retransmit; } netdump_network_poll(); DELAY(500); } nd_seqno += i; return (0); } /* * Handler for IP packets: checks their sanity and then processes any netdump * ACK packets it finds. * * It needs to replicate partially the behaviour of ip_input() and * udp_input(). * * Parameters: * mb a pointer to an mbuf * containing the packet received * Updates *mb if m_pullup et al change the pointer * Assumes the calling function will take care of freeing the mbuf */ static void netdump_handle_ip(struct mbuf **mb) { struct ip *ip; struct udpiphdr *udp; struct netdump_ack *nd_ack; struct mbuf *m; int rcv_ackno; unsigned short hlen; /* IP processing. */ m = *mb; if (m->m_pkthdr.len < sizeof(struct ip)) { NETDDEBUG("dropping packet too small for IP header\n"); return; } if (m->m_len < sizeof(struct ip)) { m = m_pullup(m, sizeof(struct ip)); *mb = m; if (m == NULL) { NETDDEBUG("m_pullup failed\n"); return; } } ip = mtod(m, struct ip *); /* IP version. */ if (ip->ip_v != IPVERSION) { NETDDEBUG("bad IP version %d\n", ip->ip_v); return; } /* Header length. */ hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { NETDDEBUG("bad IP header length (%hu)\n", hlen); return; } if (hlen > m->m_len) { m = m_pullup(m, hlen); *mb = m; if (m == NULL) { NETDDEBUG("m_pullup failed\n"); return; } ip = mtod(m, struct ip *); } /* Ignore packets with IP options. */ if (hlen > sizeof(struct ip)) { NETDDEBUG("drop packet with IP options\n"); return; } #ifdef INVARIANTS if (((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) && (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { NETDDEBUG("Bad IP header (RFC1122)\n"); return; } #endif /* Checksum. */ if ((m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) != 0) { if ((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0) { NETDDEBUG("bad IP checksum\n"); return; } } else { /* XXX */ ; } /* Convert fields to host byte order. */ ip->ip_len = ntohs(ip->ip_len); if (ip->ip_len < hlen) { NETDDEBUG("IP packet smaller (%hu) than header (%hu)\n", ip->ip_len, hlen); return; } if (m->m_pkthdr.len < ip->ip_len) { NETDDEBUG("IP packet bigger (%hu) than ethernet packet (%d)\n", ip->ip_len, m->m_pkthdr.len); return; } if (m->m_pkthdr.len > ip->ip_len) { /* Truncate the packet to the IP length. */ if (m->m_len == m->m_pkthdr.len) { m->m_len = ip->ip_len; m->m_pkthdr.len = ip->ip_len; } else m_adj(m, ip->ip_len - m->m_pkthdr.len); } ip->ip_off = ntohs(ip->ip_off); /* Check that the source is the server's IP. */ if (ip->ip_src.s_addr != nd_server.s_addr) { NETDDEBUG("drop packet not from server (from 0x%x)\n", ip->ip_src.s_addr); return; } /* Check if the destination IP is ours. */ if (ip->ip_dst.s_addr != nd_client.s_addr) { NETDDEBUGV("drop packet not to our IP\n"); return; } if (ip->ip_p != IPPROTO_UDP) { NETDDEBUG("drop non-UDP packet\n"); return; } /* Do not deal with fragments. */ if ((ip->ip_off & (IP_MF | IP_OFFMASK)) != 0) { NETDDEBUG("drop fragmented packet\n"); return; } /* UDP custom is to have packet length not include IP header. */ ip->ip_len -= hlen; /* UDP processing. */ /* Get IP and UDP headers together, along with the netdump packet. */ if (m->m_pkthdr.len < sizeof(struct udpiphdr) + sizeof(struct netdump_ack)) { NETDDEBUG("ignoring small packet\n"); return; } if (m->m_len < sizeof(struct udpiphdr) + sizeof(struct netdump_ack)) { m = m_pullup(m, sizeof(struct udpiphdr) + sizeof(struct netdump_ack)); *mb = m; if (m == NULL) { NETDDEBUG("m_pullup failed\n"); return; } } udp = mtod(m, struct udpiphdr *); if (ntohs(udp->ui_u.uh_dport) != NETDUMP_ACKPORT) { NETDDEBUG("not on the netdump port.\n"); return; } /* Netdump processing. */ /* * Packet is meant for us. Extract the ack sequence number and the * port number if necessary. */ nd_ack = (struct netdump_ack *)(mtod(m, caddr_t) + sizeof(struct udpiphdr)); rcv_ackno = ntohl(nd_ack->na_seqno); if (nd_server_port == NETDUMP_PORT) nd_server_port = ntohs(udp->ui_u.uh_sport); if (rcv_ackno >= nd_seqno + NETDUMP_MAX_IN_FLIGHT) printf("%s: ACK %d too far in future!\n", __func__, rcv_ackno); else if (rcv_ackno >= nd_seqno) { /* We're interested in this ack. Record it. */ rcvd_acks |= 1 << (rcv_ackno - nd_seqno); } } /* * Handler for ARP packets: checks their sanity and then * 1. If the ARP is a request for our IP, respond with our MAC address * 2. If the ARP is a response from our server, record its MAC address * * It needs to replicate partially the behaviour of arpintr() and * in_arpinput(). * * Parameters: * mb a pointer to an mbuf * containing the packet received * Updates *mb if m_pullup et al change the pointer * Assumes the calling function will take care of freeing the mbuf */ static void netdump_handle_arp(struct mbuf **mb) { char buf[INET_ADDRSTRLEN]; struct in_addr isaddr, itaddr, myaddr; struct ether_addr dst; struct mbuf *m; struct arphdr *ah; struct ifnet *ifp; uint8_t *enaddr; int req_len, op; m = *mb; ifp = m->m_pkthdr.rcvif; if (m->m_len < sizeof(struct arphdr)) { m = m_pullup(m, sizeof(struct arphdr)); *mb = m; if (m == NULL) { NETDDEBUG("runt packet: m_pullup failed\n"); return; } } ah = mtod(m, struct arphdr *); if (ntohs(ah->ar_hrd) != ARPHRD_ETHER) { NETDDEBUG("unknown hardware address 0x%2D)\n", (unsigned char *)&ah->ar_hrd, ""); return; } if (ntohs(ah->ar_pro) != ETHERTYPE_IP) { NETDDEBUG("drop ARP for unknown protocol %d\n", ntohs(ah->ar_pro)); return; } req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); if (m->m_len < req_len) { m = m_pullup(m, req_len); *mb = m; if (m == NULL) { NETDDEBUG("runt packet: m_pullup failed\n"); return; } } ah = mtod(m, struct arphdr *); op = ntohs(ah->ar_op); memcpy(&isaddr, ar_spa(ah), sizeof(isaddr)); memcpy(&itaddr, ar_tpa(ah), sizeof(itaddr)); enaddr = (uint8_t *)IF_LLADDR(ifp); myaddr = nd_client; if (memcmp(ar_sha(ah), enaddr, ifp->if_addrlen) == 0) { NETDDEBUG("ignoring ARP from myself\n"); return; } if (isaddr.s_addr == nd_client.s_addr) { printf("%s: %*D is using my IP address %s!\n", __func__, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa_r(isaddr, buf)); return; } if (memcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen) == 0) { NETDDEBUG("ignoring ARP from broadcast address\n"); return; } if (op == ARPOP_REPLY) { if (isaddr.s_addr != nd_gateway.s_addr && isaddr.s_addr != nd_server.s_addr) { inet_ntoa_r(isaddr, buf); NETDDEBUG( "ignoring ARP reply from %s (not netdump server)\n", buf); return; } memcpy(nd_gw_mac.octet, ar_sha(ah), min(ah->ar_hln, ETHER_ADDR_LEN)); have_gw_mac = 1; NETDDEBUG("got server MAC address %6D\n", nd_gw_mac.octet, ":"); return; } if (op != ARPOP_REQUEST) { NETDDEBUG("ignoring ARP non-request/reply\n"); return; } if (itaddr.s_addr != nd_client.s_addr) { NETDDEBUG("ignoring ARP not to our IP\n"); return; } memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); memcpy(ar_sha(ah), enaddr, ah->ar_hln); memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_len = arphdr_len(ah); m->m_pkthdr.len = m->m_len; memcpy(dst.octet, ar_tha(ah), ETHER_ADDR_LEN); netdump_ether_output(m, ifp, dst, ETHERTYPE_ARP); *mb = NULL; } /* * Handler for incoming packets directly from the network adapter * Identifies the packet type (IP or ARP) and passes it along to one of the * helper functions netdump_handle_ip or netdump_handle_arp. * * It needs to replicate partially the behaviour of ether_input() and * ether_demux(). * * Parameters: * ifp the interface the packet came from (should be nd_ifp) * m an mbuf containing the packet received */ static void netdump_pkt_in(struct ifnet *ifp, struct mbuf *m) { struct ifreq ifr; struct ether_header *eh; u_short etype; /* Ethernet processing. */ if ((m->m_flags & M_PKTHDR) == 0) { NETDDEBUG_IF(ifp, "discard frame without packet header\n"); goto done; } if (m->m_len < ETHER_HDR_LEN) { NETDDEBUG_IF(ifp, "discard frame without leading eth header (len %u pktlen %u)\n", m->m_len, m->m_pkthdr.len); goto done; } if ((m->m_flags & M_HASFCS) != 0) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) { NETDDEBUG_IF(ifp, "ignoring vlan packets\n"); goto done; } if (if_gethwaddr(ifp, &ifr) != 0) { NETDDEBUG_IF(ifp, "failed to get hw addr for interface\n"); goto done; } if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost, ETHER_ADDR_LEN) != 0) { NETDDEBUG_IF(ifp, "discard frame with incorrect destination addr\n"); goto done; } /* Done ethernet processing. Strip off the ethernet header. */ m_adj(m, ETHER_HDR_LEN); switch (etype) { case ETHERTYPE_ARP: netdump_handle_arp(&m); break; case ETHERTYPE_IP: netdump_handle_ip(&m); break; default: NETDDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype); break; } done: if (m != NULL) m_freem(m); } /* * After trapping, instead of assuming that most of the network stack is sane, * we just poll the driver directly for packets. */ static void netdump_network_poll(void) { MPASS(nd_ifp != NULL); nd_ifp->if_netdump_methods->nd_poll(nd_ifp, 1000); } /*- * Dumping specific primitives. */ /* * Callback from dumpsys() to dump a chunk of memory. * Copies it out to our static buffer then sends it across the network. * Detects the initial KDH and makes sure it is given a special packet type. * * Parameters: * priv Unused. Optional private pointer. * virtual Virtual address (where to read the data from) * physical Unused. Physical memory address. * offset Offset from start of core file * length Data length * * Return value: * 0 on success * errno on error */ static int netdump_dumper(void *priv __unused, void *virtual, vm_offset_t physical __unused, off_t offset, size_t length) { int error; NETDDEBUGV("netdump_dumper(NULL, %p, NULL, %ju, %zu)\n", virtual, (uintmax_t)offset, length); if (virtual == NULL) { if (dump_failed != 0) printf("failed to dump the kernel core\n"); else if (netdump_send(NETDUMP_FINISHED, 0, NULL, 0) != 0) printf("failed to close the transaction\n"); else printf("\nnetdump finished.\n"); netdump_cleanup(); return (0); } if (length > sizeof(nd_buf)) return (ENOSPC); memmove(nd_buf, virtual, length); error = netdump_send(NETDUMP_VMCORE, offset, nd_buf, length); if (error != 0) { dump_failed = 1; return (error); } return (0); } /* * Perform any initalization needed prior to transmitting the kernel core. */ static int netdump_start(struct dumperinfo *di) { char *path; char buf[INET_ADDRSTRLEN]; uint32_t len; int error; error = 0; /* Check if the dumping is allowed to continue. */ if (nd_enabled == 0) return (EINVAL); if (panicstr == NULL) { printf( "netdump_start: netdump may only be used after a panic\n"); return (EINVAL); } MPASS(nd_ifp != NULL); if (nd_server.s_addr == INADDR_ANY) { printf("netdump_start: can't netdump; no server IP given\n"); return (EINVAL); } if (nd_client.s_addr == INADDR_ANY) { printf("netdump_start: can't netdump; no client IP given\n"); return (EINVAL); } /* We start dumping at offset 0. */ di->dumpoff = 0; nd_seqno = 1; /* * nd_server_port could have switched after the first ack the * first time it gets called. Adjust it accordingly. */ nd_server_port = NETDUMP_PORT; /* Switch to the netdump mbuf zones. */ netdump_mbuf_dump(); nd_ifp->if_netdump_methods->nd_event(nd_ifp, NETDUMP_START); /* Make the card use *our* receive callback. */ drv_if_input = nd_ifp->if_input; nd_ifp->if_input = netdump_pkt_in; if (nd_gateway.s_addr == INADDR_ANY) { restore_gw_addr = 1; nd_gateway.s_addr = nd_server.s_addr; } printf("netdump in progress. searching for server...\n"); if (netdump_arp_gw()) { printf("failed to locate server MAC address\n"); error = EINVAL; goto trig_abort; } if (nd_path[0] != '\0') { path = nd_path; len = strlen(path) + 1; } else { path = NULL; len = 0; } if (netdump_send(NETDUMP_HERALD, 0, path, len) != 0) { printf("failed to contact netdump server\n"); error = EINVAL; goto trig_abort; } printf("netdumping to %s (%6D)\n", inet_ntoa_r(nd_server, buf), nd_gw_mac.octet, ":"); return (0); trig_abort: netdump_cleanup(); return (error); } static int netdump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh, void *key, uint32_t keysize) { int error; memcpy(nd_buf, kdh, sizeof(*kdh)); error = netdump_send(NETDUMP_KDH, 0, nd_buf, sizeof(*kdh)); if (error == 0 && keysize > 0) { if (keysize > sizeof(nd_buf)) return (EINVAL); memcpy(nd_buf, key, keysize); error = netdump_send(NETDUMP_EKCD_KEY, 0, nd_buf, keysize); } return (error); } /* * Cleanup routine for a possibly failed netdump. */ static void netdump_cleanup(void) { if (restore_gw_addr != 0) { nd_gateway.s_addr = INADDR_ANY; restore_gw_addr = 0; } if (drv_if_input != NULL) { nd_ifp->if_input = drv_if_input; drv_if_input = NULL; } nd_ifp->if_netdump_methods->nd_event(nd_ifp, NETDUMP_END); } /*- * KLD specific code. */ static struct cdevsw netdump_cdevsw = { .d_version = D_VERSION, .d_ioctl = netdump_ioctl, .d_name = "netdump", }; static struct cdev *netdump_cdev; static int netdump_configure(struct netdump_conf *conf, struct thread *td) { struct ifnet *ifp; CURVNET_SET(TD_TO_VNET(td)); if (!IS_DEFAULT_VNET(curvnet)) { CURVNET_RESTORE(); return (EINVAL); } IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strcmp(ifp->if_xname, conf->ndc_iface) == 0) break; } /* XXX ref */ IFNET_RUNLOCK_NOSLEEP(); CURVNET_RESTORE(); if (ifp == NULL) return (ENOENT); if ((if_getflags(ifp) & IFF_UP) == 0) return (ENXIO); if (!netdump_supported_nic(ifp) || ifp->if_type != IFT_ETHER) return (EINVAL); nd_ifp = ifp; netdump_reinit(ifp); memcpy(&nd_conf, conf, sizeof(nd_conf)); nd_enabled = 1; return (0); } /* * Reinitialize the mbuf pool used by drivers while dumping. This is called * from the generic ioctl handler for SIOCSIFMTU after the driver has * reconfigured itself. */ void netdump_reinit(struct ifnet *ifp) { int clsize, nmbuf, ncl, nrxr; if (ifp != nd_ifp) return; ifp->if_netdump_methods->nd_init(ifp, &nrxr, &ncl, &clsize); KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr)); /* * We need two headers per message on the transmit side. Multiply by * four to give us some breathing room. */ nmbuf = ncl * (4 + nrxr); ncl *= nrxr; netdump_mbuf_reinit(nmbuf, ncl, clsize); } /* * ioctl(2) handler for the netdump device. This is currently only used to * register netdump as a dump device. * * Parameters: * dev, Unused. * cmd, The ioctl to be handled. * addr, The parameter for the ioctl. * flags, Unused. * td, The thread invoking this ioctl. * * Returns: * 0 on success, and an errno value on failure. */ static int netdump_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr, int flags __unused, struct thread *td) { struct diocskerneldump_arg *kda; struct dumperinfo dumper; struct netdump_conf *conf; uint8_t *encryptedkey; int error; u_int u; error = 0; switch (cmd) { case DIOCSKERNELDUMP: u = *(u_int *)addr; if (u != 0) { error = ENXIO; break; } if (nd_enabled) { nd_enabled = 0; netdump_mbuf_drain(); } break; case NETDUMPGCONF: conf = (struct netdump_conf *)addr; if (!nd_enabled) { error = ENXIO; break; } strlcpy(conf->ndc_iface, nd_ifp->if_xname, sizeof(conf->ndc_iface)); memcpy(&conf->ndc_server, &nd_server, sizeof(nd_server)); memcpy(&conf->ndc_client, &nd_client, sizeof(nd_client)); memcpy(&conf->ndc_gateway, &nd_gateway, sizeof(nd_gateway)); break; case NETDUMPSCONF: conf = (struct netdump_conf *)addr; encryptedkey = NULL; kda = &conf->ndc_kda; conf->ndc_iface[sizeof(conf->ndc_iface) - 1] = '\0'; if (kda->kda_enable == 0) { if (nd_enabled) { error = clear_dumper(td); if (error == 0) { nd_enabled = 0; netdump_mbuf_drain(); } } break; } error = netdump_configure(conf, td); if (error != 0) break; if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { if (kda->kda_encryptedkeysize <= 0 || kda->kda_encryptedkeysize > KERNELDUMP_ENCKEY_MAX_SIZE) return (EINVAL); encryptedkey = malloc(kda->kda_encryptedkeysize, M_TEMP, M_WAITOK); error = copyin(kda->kda_encryptedkey, encryptedkey, kda->kda_encryptedkeysize); if (error != 0) { free(encryptedkey, M_TEMP); return (error); } } memset(&dumper, 0, sizeof(dumper)); dumper.dumper_start = netdump_start; dumper.dumper_hdr = netdump_write_headers; dumper.dumper = netdump_dumper; dumper.priv = NULL; dumper.blocksize = NETDUMP_DATASIZE; dumper.maxiosize = MAXDUMPPGS * PAGE_SIZE; dumper.mediaoffset = 0; dumper.mediasize = 0; error = set_dumper(&dumper, conf->ndc_iface, td, kda->kda_compression, kda->kda_encryption, kda->kda_key, kda->kda_encryptedkeysize, encryptedkey); if (encryptedkey != NULL) { explicit_bzero(encryptedkey, kda->kda_encryptedkeysize); free(encryptedkey, M_TEMP); } if (error != 0) { nd_enabled = 0; netdump_mbuf_drain(); } break; default: error = EINVAL; break; } return (error); } /* * Called upon system init or kld load. Initializes the netdump parameters to * sane defaults (locates the first available NIC and uses the first IPv4 IP on * that card as the client IP). Leaves the server IP unconfigured. * * Parameters: * mod, Unused. * what, The module event type. * priv, Unused. * * Returns: * int, An errno value if an error occured, 0 otherwise. */ static int netdump_modevent(module_t mod __unused, int what, void *priv __unused) { struct netdump_conf conf; char *arg; int error; error = 0; switch (what) { case MOD_LOAD: error = make_dev_p(MAKEDEV_WAITOK, &netdump_cdev, &netdump_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "netdump"); if (error != 0) return (error); if ((arg = kern_getenv("net.dump.iface")) != NULL) { strlcpy(conf.ndc_iface, arg, sizeof(conf.ndc_iface)); freeenv(arg); if ((arg = kern_getenv("net.dump.server")) != NULL) { inet_aton(arg, &conf.ndc_server); freeenv(arg); } if ((arg = kern_getenv("net.dump.client")) != NULL) { inet_aton(arg, &conf.ndc_server); freeenv(arg); } if ((arg = kern_getenv("net.dump.gateway")) != NULL) { inet_aton(arg, &conf.ndc_server); freeenv(arg); } /* Ignore errors; we print a message to the console. */ (void)netdump_configure(&conf, curthread); } break; case MOD_UNLOAD: destroy_dev(netdump_cdev); if (nd_enabled) { printf("netdump: disabling dump device for unload\n"); (void)clear_dumper(curthread); nd_enabled = 0; } break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t netdump_mod = { "netdump", netdump_modevent, NULL, }; MODULE_VERSION(netdump, 1); DECLARE_MODULE(netdump, netdump_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 334117) +++ head/sys/netinet/raw_ip.c (revision 334118) @@ -1,1130 +1,1130 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(int, ip_defttl) = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* * Control and data hooks for ipfw, dummynet, divert and so on. * The data hooks are not used here but it is convenient * to keep them all in one place. */ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; int (*ip_dn_ctl_ptr)(struct sockopt *); int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); #ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip_mrouter); /* * The various mrouter and rsvp functions. */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(u_long, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); int (*rsvp_input_p)(struct mbuf **, int *, int); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ extern struct protosw inetsw[]; u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); /* * Hash functions */ #define INP_PCBHASH_RAW_SIZE 256 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) #ifdef INET static void rip_inshash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *pcbhash; int hash; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != INADDR_ANY) { hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void rip_delhash(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); LIST_REMOVE(inp, inp_hash); } #endif /* INET */ /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ static void rip_zone_change(void *tag) { uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE static void rip_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_ripcbinfo); } VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); #endif #ifdef INET static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; INP_LOCK_ASSERT(last); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv4)) { if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0) policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. */ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return (policyfail); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; int hash; *mp = NULL; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; ifp = m->m_pkthdr.rcvif; hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. */ if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } if (last != NULL) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (!in_nullhost(inp->inp_laddr) && !in_hosteq(inp->inp_laddr, ip->ip_dst)) continue; if (!in_nullhost(inp->inp_faddr) && !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. * * In the case of IGMPv2, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. imo_multi_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if (proto != IPPROTO_IGMP) { struct sockaddr_in group; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, (struct sockaddr *)&group, (struct sockaddr *)&ripsrc); } if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); continue; } } if (last != NULL) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); INP_RUNLOCK(last); } else { if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { IPSTAT_INC(ips_noproto); IPSTAT_DEC(ips_delivered); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); } else { m_freem(m); } } return (IPPROTO_DONE); } /* * Generate IP header and pass packet to ip_output. Tack on options user may * have setup with control call. */ int rip_output(struct mbuf *m, struct socket *so, ...) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); va_list ap; u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; va_start(ap, so); dst = va_arg(ap, u_long); va_end(ap); /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. */ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return(ENOBUFS); INP_RLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = htons(IP_DF); else ip->ip_off = htons(0); ip->ip_p = inp->inp_ip_p; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; if (jailed(inp->inp_cred)) { /* * prison_local_ip4() would be good enough but would * let a source of INADDR_ANY pass, which we do not * want to see from jails. */ if (ip->ip_src.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, inp->inp_cred); } else { error = prison_local_ip4(inp->inp_cred, &ip->ip_src); } if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } } ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_RLOCK(inp); ip = mtod(m, struct ip *); error = prison_check_ip4(inp->inp_cred, &ip->ip_src); if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ntohs(ip->ip_len) != m->m_pkthdr.len) || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } /* * This doesn't allow application to specify ID of zero, * but we got this limitation from the beginning of history. */ if (ip->ip_id == 0) ip_fillid(ip); /* * XXX prevent ip_output from overwriting header fields. */ flags |= IP_RAWOUTPUT; IPSTAT_INC(ips_rawout); } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_RUNLOCK(inp); return (error); } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. * * XXX-BZ inp locking? */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp->inp_inc.inc_fibnum = so->so_fibnum; return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which are * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls * in_ifadown() to remove all routes corresponding to that address. It also * receives the PRC_IFUP messages from if_up() and reinstalls the interface * routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); ifa_free(&ia->ia_ifa); break; } } if (ia == NULL) /* If ia matched, already unlocked. */ IN_IFADDR_RUNLOCK(&in_ifa_tracker); break; case PRC_IFUP: IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = ifa_del_loopback_route((struct ifaddr *)ia, sa); err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; err = ifa_add_loopback_route((struct ifaddr *)ia, sa); ifa_free(&ia->ia_ifa); break; } } static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; rip_inshash(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static void rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); /* XXX defer to epoch_call */ in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { struct inpcbinfo *pcbinfo; pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); } static void rip_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); rip_dodisconnect(so, inp); } static void rip_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); rip_dodisconnect(so, inp); } static int rip_disconnect(struct socket *so) { struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); rip_dodisconnect(so, inp); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; int error; if (nam->sa_len != sizeof(*addr)) return (EINVAL); error = prison_check_ip4(td->td_ucred, &addr->sin_addr); if (error != 0) return (error); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); - if (TAILQ_EMPTY(&V_ifnet) || + if (CK_STAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && (inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); - if (TAILQ_EMPTY(&V_ifnet)) + if (CK_STAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return (rip_output(m, so, dst)); } #endif /* INET */ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_ripcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_EPOCH_CALL_WAITOK); inp_list = il->il_inp_list; INP_INFO_RLOCK(&V_ripcbinfo); for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } il->il_count = n; il->il_pcbinfo = &V_ripcbinfo; epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_ripcbinfo); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #ifdef INET struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; #endif /* INET */ Index: head/sys/netinet/sctp_bsd_addr.c =================================================================== --- head/sys/netinet/sctp_bsd_addr.c (revision 334117) +++ head/sys/netinet/sctp_bsd_addr.c (revision 334118) @@ -1,546 +1,546 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Declare all of our malloc named types */ MALLOC_DEFINE(SCTP_M_MAP, "sctp_map", "sctp asoc map descriptor"); MALLOC_DEFINE(SCTP_M_STRMI, "sctp_stri", "sctp stream in array"); MALLOC_DEFINE(SCTP_M_STRMO, "sctp_stro", "sctp stream out array"); MALLOC_DEFINE(SCTP_M_ASC_ADDR, "sctp_aadr", "sctp asconf address"); MALLOC_DEFINE(SCTP_M_ASC_IT, "sctp_a_it", "sctp asconf iterator"); MALLOC_DEFINE(SCTP_M_AUTH_CL, "sctp_atcl", "sctp auth chunklist"); MALLOC_DEFINE(SCTP_M_AUTH_KY, "sctp_atky", "sctp auth key"); MALLOC_DEFINE(SCTP_M_AUTH_HL, "sctp_athm", "sctp auth hmac list"); MALLOC_DEFINE(SCTP_M_AUTH_IF, "sctp_athi", "sctp auth info"); MALLOC_DEFINE(SCTP_M_STRESET, "sctp_stre", "sctp stream reset"); MALLOC_DEFINE(SCTP_M_CMSG, "sctp_cmsg", "sctp CMSG buffer"); MALLOC_DEFINE(SCTP_M_COPYAL, "sctp_cpal", "sctp copy all"); MALLOC_DEFINE(SCTP_M_VRF, "sctp_vrf", "sctp vrf struct"); MALLOC_DEFINE(SCTP_M_IFA, "sctp_ifa", "sctp ifa struct"); MALLOC_DEFINE(SCTP_M_IFN, "sctp_ifn", "sctp ifn struct"); MALLOC_DEFINE(SCTP_M_TIMW, "sctp_timw", "sctp time block"); MALLOC_DEFINE(SCTP_M_MVRF, "sctp_mvrf", "sctp mvrf pcb list"); MALLOC_DEFINE(SCTP_M_ITER, "sctp_iter", "sctp iterator control"); MALLOC_DEFINE(SCTP_M_SOCKOPT, "sctp_socko", "sctp socket option"); MALLOC_DEFINE(SCTP_M_MCORE, "sctp_mcore", "sctp mcore queue"); /* Global NON-VNET structure that controls the iterator */ struct iterator_control sctp_it_ctl; void sctp_wakeup_iterator(void) { wakeup(&sctp_it_ctl.iterator_running); } static void sctp_iterator_thread(void *v SCTP_UNUSED) { SCTP_IPI_ITERATOR_WQ_LOCK(); /* In FreeBSD this thread never terminates. */ for (;;) { msleep(&sctp_it_ctl.iterator_running, &sctp_it_ctl.ipi_iterator_wq_mtx, 0, "waiting_for_work", 0); sctp_iterator_worker(); } } void sctp_startup_iterator(void) { if (sctp_it_ctl.thread_proc) { /* You only get one */ return; } /* Initialize global locks here, thus only once. */ SCTP_ITERATOR_LOCK_INIT(); SCTP_IPI_ITERATOR_WQ_INIT(); TAILQ_INIT(&sctp_it_ctl.iteratorhead); kproc_create(sctp_iterator_thread, (void *)NULL, &sctp_it_ctl.thread_proc, RFPROC, SCTP_KTHREAD_PAGES, SCTP_KTRHEAD_NAME); } #ifdef INET6 void sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa) { struct in6_ifaddr *ifa6; ifa6 = (struct in6_ifaddr *)ifa->ifa; ifa->flags = ifa6->ia6_flags; if (!MODULE_GLOBAL(ip6_use_deprecated)) { if (ifa->flags & IN6_IFF_DEPRECATED) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } if (ifa->flags & (IN6_IFF_DETACHED | IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } #endif /* INET6 */ static uint32_t sctp_is_desired_interface_type(struct ifnet *ifn) { int result; /* check the interface type to see if it's one we care about */ switch (ifn->if_type) { case IFT_ETHER: case IFT_ISO88023: case IFT_ISO88024: case IFT_ISO88025: case IFT_ISO88026: case IFT_STARLAN: case IFT_P10: case IFT_P80: case IFT_HY: case IFT_FDDI: case IFT_XETHER: case IFT_ISDNBASIC: case IFT_ISDNPRIMARY: case IFT_PTPSERIAL: case IFT_OTHER: case IFT_PPP: case IFT_LOOP: case IFT_SLIP: case IFT_GIF: case IFT_L2VLAN: case IFT_STF: case IFT_IP: case IFT_IPOVERCDLC: case IFT_IPOVERCLAW: case IFT_PROPVIRTUAL: /* NetGraph Virtual too */ case IFT_VIRTUALIPADDRESS: result = 1; break; default: result = 0; } return (result); } static void sctp_init_ifns_for_vrf(int vrfid) { /* * Here we must apply ANY locks needed by the IFN we access and also * make sure we lock any IFA that exists as we float through the * list of IFA's */ struct ifnet *ifn; struct ifaddr *ifa; struct sctp_ifa *sctp_ifa; uint32_t ifa_flags; #ifdef INET6 struct in6_ifaddr *ifa6; #endif IFNET_RLOCK(); - TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { + CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { if (sctp_is_desired_interface_type(ifn) == 0) { /* non desired type */ continue; } IF_ADDR_RLOCK(ifn); CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) { continue; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { continue; } break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { /* skip unspecifed addresses */ continue; } break; #endif default: continue; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_flags = 0; break; #endif #ifdef INET6 case AF_INET6: ifa6 = (struct in6_ifaddr *)ifa; ifa_flags = ifa6->ia6_flags; break; #endif default: ifa_flags = 0; break; } sctp_ifa = sctp_add_addr_to_vrf(vrfid, (void *)ifn, ifn->if_index, ifn->if_type, ifn->if_xname, (void *)ifa, ifa->ifa_addr, ifa_flags, 0); if (sctp_ifa) { sctp_ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } } IF_ADDR_RUNLOCK(ifn); } IFNET_RUNLOCK(); } void sctp_init_vrf_list(int vrfid) { if (vrfid > SCTP_MAX_VRF_ID) /* can't do that */ return; /* Don't care about return here */ (void)sctp_allocate_vrf(vrfid); /* * Now we need to build all the ifn's for this vrf and there * addresses */ sctp_init_ifns_for_vrf(vrfid); } void sctp_addr_change(struct ifaddr *ifa, int cmd) { uint32_t ifa_flags = 0; if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { return; } /* * BSD only has one VRF, if this changes we will need to hook in the * right things here to get the id to pass to the address management * routine. */ if (SCTP_BASE_VAR(first_time) == 0) { /* Special test to see if my ::1 will showup with this */ SCTP_BASE_VAR(first_time) = 1; sctp_init_ifns_for_vrf(SCTP_DEFAULT_VRFID); } if ((cmd != RTM_ADD) && (cmd != RTM_DELETE)) { /* don't know what to do with this */ return; } if (ifa->ifa_addr == NULL) { return; } if (sctp_is_desired_interface_type(ifa->ifa_ifp) == 0) { /* non desired type */ return; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { return; } break; #endif #ifdef INET6 case AF_INET6: ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { /* skip unspecifed addresses */ return; } break; #endif default: /* non inet/inet6 skip */ return; } if (cmd == RTM_ADD) { (void)sctp_add_addr_to_vrf(SCTP_DEFAULT_VRFID, (void *)ifa->ifa_ifp, ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_type, ifa->ifa_ifp->if_xname, (void *)ifa, ifa->ifa_addr, ifa_flags, 1); } else { sctp_del_addr_from_vrf(SCTP_DEFAULT_VRFID, ifa->ifa_addr, ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_xname); /* * We don't bump refcount here so when it completes the * final delete will happen. */ } } void sctp_add_or_del_interfaces(int (*pred) (struct ifnet *), int add){ struct ifnet *ifn; struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { + CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { if (!(*pred) (ifn)) { continue; } CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { sctp_addr_change(ifa, add ? RTM_ADD : RTM_DELETE); } } IFNET_RUNLOCK(); } struct mbuf * sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, int how, int allonebuf, int type) { struct mbuf *m = NULL; m = m_getm2(NULL, space_needed, how, type, want_header ? M_PKTHDR : 0); if (m == NULL) { /* bad, no memory */ return (m); } if (allonebuf) { if (SCTP_BUF_SIZE(m) < space_needed) { m_freem(m); return (NULL); } KASSERT(SCTP_BUF_NEXT(m) == NULL, ("%s: no chain allowed", __FUNCTION__)); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mb(m, SCTP_MBUF_IALLOC); } #endif return (m); } #ifdef SCTP_PACKET_LOGGING void sctp_packet_log(struct mbuf *m) { int *lenat, thisone; void *copyto; uint32_t *tick_tock; int length; int total_len; int grabbed_lock = 0; int value, newval, thisend, thisbegin; /* * Buffer layout. -sizeof this entry (total_len) -previous end * (value) -ticks of log (ticks) o -ip packet o -as logged - * where this started (thisbegin) x <--end points here */ length = SCTP_HEADER_LEN(m); total_len = SCTP_SIZE32((length + (4 * sizeof(int)))); /* Log a packet to the buffer. */ if (total_len > SCTP_PACKET_LOG_SIZE) { /* Can't log this packet I have not a buffer big enough */ return; } if (length < (int)(SCTP_MIN_V4_OVERHEAD + sizeof(struct sctp_cookie_ack_chunk))) { return; } atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), 1); try_again: if (SCTP_BASE_VAR(packet_log_writers) > SCTP_PKTLOG_WRITERS_NEED_LOCK) { SCTP_IP_PKTLOG_LOCK(); grabbed_lock = 1; again_locked: value = SCTP_BASE_VAR(packet_log_end); newval = SCTP_BASE_VAR(packet_log_end) + total_len; if (newval >= SCTP_PACKET_LOG_SIZE) { /* we wrapped */ thisbegin = 0; thisend = total_len; } else { thisbegin = SCTP_BASE_VAR(packet_log_end); thisend = newval; } if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { goto again_locked; } } else { value = SCTP_BASE_VAR(packet_log_end); newval = SCTP_BASE_VAR(packet_log_end) + total_len; if (newval >= SCTP_PACKET_LOG_SIZE) { /* we wrapped */ thisbegin = 0; thisend = total_len; } else { thisbegin = SCTP_BASE_VAR(packet_log_end); thisend = newval; } if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { goto try_again; } } /* Sanity check */ if (thisend >= SCTP_PACKET_LOG_SIZE) { SCTP_PRINTF("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n", thisbegin, thisend, SCTP_BASE_VAR(packet_log_writers), grabbed_lock, SCTP_BASE_VAR(packet_log_end)); SCTP_BASE_VAR(packet_log_end) = 0; goto no_log; } lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisbegin]; *lenat = total_len; lenat++; *lenat = value; lenat++; tick_tock = (uint32_t *)lenat; lenat++; *tick_tock = sctp_get_tick_count(); copyto = (void *)lenat; thisone = thisend - sizeof(int); lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisone]; *lenat = thisbegin; if (grabbed_lock) { SCTP_IP_PKTLOG_UNLOCK(); grabbed_lock = 0; } m_copydata(m, 0, length, (caddr_t)copyto); no_log: if (grabbed_lock) { SCTP_IP_PKTLOG_UNLOCK(); } atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), 1); } int sctp_copy_out_packet_log(uint8_t *target, int length) { /* * We wind through the packet log starting at start copying up to * length bytes out. We return the number of bytes copied. */ int tocopy, this_copy; int *lenat; int did_delay = 0; tocopy = length; if (length < (int)(2 * sizeof(int))) { /* not enough room */ return (0); } if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK); again: if ((did_delay == 0) && (SCTP_BASE_VAR(packet_log_writers) != SCTP_PKTLOG_WRITERS_NEED_LOCK)) { /* * we delay here for just a moment hoping the * writer(s) that were present when we entered will * have left and we only have locking ones that will * contend with us for the lock. This does not * assure 100% access, but its good enough for a * logging facility like this. */ did_delay = 1; DELAY(10); goto again; } } SCTP_IP_PKTLOG_LOCK(); lenat = (int *)target; *lenat = SCTP_BASE_VAR(packet_log_end); lenat++; this_copy = min((length - sizeof(int)), SCTP_PACKET_LOG_SIZE); memcpy((void *)lenat, (void *)SCTP_BASE_VAR(packet_log_buffer), this_copy); if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK); } SCTP_IP_PKTLOG_UNLOCK(); return (this_copy + sizeof(int)); } #endif Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 334117) +++ head/sys/netinet6/icmp6.c (revision 334118) @@ -1,2818 +1,2818 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #define MBUF_PRIVATE /* XXXRW: Optimisation tries to avoid M_EXT mbufs */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat); VNET_PCPUSTAT_SYSINIT(icmp6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(int, icmp6errppslim); static VNET_DEFINE(int, icmp6errpps_count) = 0; static VNET_DEFINE(struct timeval, icmp6errppslim_last); VNET_DECLARE(int, icmp6_nodeinfo); #define V_ripcbinfo VNET(ripcbinfo) #define V_ripcb VNET(ripcb) #define V_icmp6errppslim VNET(icmp6errppslim) #define V_icmp6errpps_count VNET(icmp6errpps_count) #define V_icmp6errppslim_last VNET(icmp6errppslim_last) #define V_icmp6_nodeinfo VNET(icmp6_nodeinfo) static void icmp6_errcount(int, int); static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *, struct in6_addr *); static struct mbuf *ni6_input(struct mbuf *, int); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf **, int, int, int); /* * Kernel module interface for updating icmp6stat. The argument is an index * into icmp6stat treated as an array of u_quad_t. While this encodes the * general layout of icmp6stat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmp6stat_inc(int statnum) { counter_u64_add(VNET(icmp6stat)[statnum], 1); } static void icmp6_errcount(int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6STAT_INC(icp6s_odst_unreach_noroute); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6STAT_INC(icp6s_odst_unreach_admin); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6STAT_INC(icp6s_odst_unreach_beyondscope); return; case ICMP6_DST_UNREACH_ADDR: ICMP6STAT_INC(icp6s_odst_unreach_addr); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6STAT_INC(icp6s_odst_unreach_noport); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6STAT_INC(icp6s_opacket_too_big); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6STAT_INC(icp6s_otime_exceed_transit); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6STAT_INC(icp6s_otime_exceed_reassembly); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6STAT_INC(icp6s_oparamprob_header); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6STAT_INC(icp6s_oparamprob_nextheader); return; case ICMP6_PARAMPROB_OPTION: ICMP6STAT_INC(icp6s_oparamprob_option); return; } break; case ND_REDIRECT: ICMP6STAT_INC(icp6s_oredirect); return; } ICMP6STAT_INC(icp6s_ounknown); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; ICMP6STAT_INC(icp6s_error); /* count per-type-code statistics */ icmp6_errcount(type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { ICMP6STAT_INC(icp6s_canterror); goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6STAT_INC(icp6s_canterror); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { ICMP6STAT_INC(icp6s_toofreq); goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_NOWAIT); /* FIB is also copied over. */ if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); ICMP6STAT_INC(icp6s_outhist[type]); icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; int ip6len, error; ifp = m->m_pkthdr.rcvif; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } /* * Check multicast group membership. * Note: SSM filters are not applied for ICMPv6 traffic. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *inm; inm = in6m_lookup(ifp, &ip6->ip6_dst); if (inm == NULL) { IP6STAT_INC(ip6s_notmember); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto freeit; } } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); ICMP6STAT_INC(icp6s_checksum); goto freeit; } ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]); icmp6_ifstat_inc(ifp, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */ code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); code = PRC_UNREACH_ADMIN_PROHIB; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(ifp, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(ifp, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(ifp, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Give up remote */ break; } if (!M_WRITABLE(n) || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; int n0len; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN); n = m_gethdr(M_NOWAIT, n0->m_type); if (n == NULL) { /* Give up remote */ m_freem(n0); break; } m_move_pkthdr(n, n0); /* FIB copied. */ n0len = n0->m_pkthdr.len; /* save for use below */ /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); noff = off; } if (n) { nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]); icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(ifp, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_DONE: case MLDV2_LISTENER_REPORT: /* * Drop MLD traffic which is not link-local, has a hop limit * of greater than 1 hop, or which does not have the * IPv6 HBH Router Alert option. * As IPv6 HBH options are stripped in ip6_input() we must * check an mbuf header flag. * XXX Should we also sanity check that these messages * were directed to a link-local multicast prefix? */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; if (mld_input(m, off, icmp6len) != 0) return (IPPROTO_DONE); /* m stays. */ break; case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { struct prison *pr; u_char *p; int maxhlen, hlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) != (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) break; if (code != 0) goto badcode; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN); n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { /* Give up remote */ break; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; break; } maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hlen = strlen(pr->pr_hostname); if (maxhlen > hlen) maxhlen = hlen; /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]); icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { IP6_EXTHDR_CHECK(m, off, icmp6len, IPPROTO_DONE); error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_rs_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ra_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ns_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_na_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_REDIRECT: icmp6_ifstat_inc(ifp, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); icmp6_redirect_input(m, off); m = n; if (m == NULL) goto freeit; break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp ? ifp->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code) != 0) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: ICMP6STAT_INC(icp6s_badcode); break; badlen: ICMP6STAT_INC(icp6s_badlen); break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. */ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; struct prison *pr; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { ifa_free(&ia6->ia_ifa); nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } ifa_free(&ia6->ia_ifa); } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* Allocate an mbuf to reply. */ if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } if (replylen > MHLEN) n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR); else n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } m_move_pkthdr(n, m); /* just for recvif and FIB */ n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in hostname? */ pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); mtx_unlock(&pr->pr_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */ if (len > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) goto fail; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } IF_ADDR_RUNLOCK(ifp); if (iffound) { *ifpp = ifp; IFNET_RUNLOCK_NOSLEEP(); return (addrsofif); } addrs += addrsofif; } IFNET_RUNLOCK_NOSLEEP(); return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ IFNET_RLOCK_NOSLEEP(); - ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet); + ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet); again: - for (; ifp; ifp = TAILQ_NEXT(ifp, if_link)) { + for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { IF_ADDR_RUNLOCK(ifp); /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; IFNET_RUNLOCK_NOSLEEP(); return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. */ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_uptime) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } IF_ADDR_RUNLOCK(ifp); if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } IFNET_RUNLOCK_NOSLEEP(); return (copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *in6p; struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return (IPPROTO_DONE); } #endif /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); return (IPPROTO_DONE); } INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(in6p, &V_ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; if (in6p->inp_ip_p != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) { INP_RUNLOCK(in6p); continue; } if (last != NULL) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked( &last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); INP_RUNLOCK(last); } else { m_freem(m); IP6STAT_DEC(ip6s_delivered); } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(struct mbuf *m, size_t off) { struct in6_addr src6, *srcp; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia = NULL; struct ifnet *outif = NULL; int plen; int type, code, hlim; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ hlim = 0; srcp = NULL; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src6 = ia->ia_addr.sin6_addr; srcp = &src6; if (m->m_pkthdr.rcvif != NULL) { /* XXX: This may not be the outgoing interface */ hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else hlim = V_ip6_defhlim; } if (ia != NULL) ifa_free(&ia->ia_ifa); } if (srcp == NULL) { int error; struct in6_addr dst6; uint32_t scopeid; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ in6_splitscope(&ip6->ip6_src, &dst6, &scopeid); error = in6_selectsrc_addr(M_GETFIB(m), &dst6, scopeid, NULL, &src6, &hlim); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &ip6->ip6_dst), error)); goto bad; } srcp = &src6; } /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; ip6->ip6_src = *srcp; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = hlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = NULL; ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo(void) { mld_fasttimo(); } void icmp6_slowtimo(void) { mld_slowtimo(); } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); ifp = m->m_pkthdr.rcvif; /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; in6_splitscope(&reddst6, &kdst, &scopeid); if (fib6_lookup_nh_basic(ifp->if_fib, &kdst, scopeid, 0, 0,&nh6)==0){ if ((nh6.nh_flags & NHF_GATEWAY) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* * Embed scope zone id into next hop address, since * fib6_lookup_nh_basic() returns address without embedded * scope zone id. */ if (in6_setscope(&nh6.nh_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; if (IN6_ARE_ADDR_EQUAL(&src6, &nh6.nh_addr) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, &nh6.nh_addr), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n", __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", __func__, ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* Validation passed. */ /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); /* * Install a gateway route in the better-router case or an interface * route in the on-link-destination case. */ { struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; struct sockaddr *gw; int rt_flags; u_int fibnum; bzero(&sdst, sizeof(sdst)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rt_flags = RTF_HOST; if (is_router) { bzero(&sgw, sizeof(sgw)); sgw.sin6_family = AF_INET6; sgw.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); gw = (struct sockaddr *)&sgw; rt_flags |= RTF_GATEWAY; } else gw = ifp->if_addr->ifa_addr; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) in6_rtredirect((struct sockaddr *)&sdst, gw, (struct sockaddr *)NULL, rt_flags, (struct sockaddr *)&ssrc, fibnum); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badredirect); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct m_tag *mtag; struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; struct llentry *ln = NULL; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; M_SETFIB(m, rt->rt_fibnum); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; /* XXXRW: reference released prematurely. */ ifa_free(&ia->ia_ifa); } /* get ip6 linklocal address for the router. */ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ int len; struct nd_opt_hdr *nd_opt; char *lladdr; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(router_ll6, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } nolladdropt: if (ln != NULL) LLE_RUNLOCK(ln); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. */ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto fail; *(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type; m_tag_prepend(m, mtag); } /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? * * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6.c =================================================================== --- head/sys/netinet6/in6.c (revision 334117) +++ head/sys/netinet6/in6.c (revision 334118) @@ -1,2552 +1,2556 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * struct in6_ifreq and struct ifreq must be type punnable for common members * of ifr_ifru to allow accessors to be shared. */ _Static_assert(offsetof(struct in6_ifreq, ifr_ifru) == offsetof(struct ifreq, ifr_ifru), "struct in6_ifreq and struct ifreq are not type punnable"); VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; int fibnum; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ia62ifa(ia)->ifa_ifp->if_fib; /* Announce arrival of local address to this FIB. */ rt_newaddrmsg_fib(cmd, &ia->ia_ifa, 0, &rt, fibnum); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) { if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, true); ifa_free(&ia->ia_ifa); } if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa, false); goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_addrcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } nd6_prefix_rele(pr); /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr != NULL && pr->ndpr_addrcnt == 0) { ND6_WLOCK(); nd6_prefix_unlink(pr, NULL); ND6_WUNLOCK(); nd6_prefix_del(pr); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } static struct in6_multi_mship * in6_joingroup_legacy(struct ifnet *ifp, const struct in6_addr *mcaddr, int *errorp, int delay) { struct in6_multi_mship *imm; int error; imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT); if (imm == NULL) { *errorp = ENOBUFS; return (NULL); } delay = (delay * PR_FASTHZ) / hz; error = in6_joingroup(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); if (error) { *errorp = error; free(imm, M_IP6MADDR); return (NULL); } return (imm); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for an new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa, false); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr != NULL) in6_leavegroup(imm->i6mm_maddr, NULL); free(imm, M_IP6MADDR); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0)); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in6_ifaddrhead, ia, in6_ifaddr, ia_link); LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_addrcnt--; /* Do not delete lles within prefix if refcont != 0 */ if (ia->ia6_ndpr->ndpr_addrcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other subsystems about address change/arrival: * 1) Notifies device handler on the first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } IF_ADDR_RUNLOCK(ifp); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto done; } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) goto done; ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } done: WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Invoking IPv6 network device address event may sleep"); EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * find the interface address corresponding to a given IPv6 address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); IF_ADDR_RUNLOCK(ifp); return (besta); } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); IF_ADDR_RUNLOCK(ifp); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); IF_ADDR_RUNLOCK(ifp); return dep[1]; } IF_ADDR_RUNLOCK(ifp); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } IF_ADDR_RUNLOCK(ifp); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) || (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD)) return (0); /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionally, users may try to assign addresses * before the interface becomes up (or running). * This function returns EAGAIN in that case. * The caller should mark "tentative" on the address instead of * performing DAD immediately. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return (EAGAIN); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } IFNET_RUNLOCK_NOSLEEP(); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void -in6_lltable_destroy_lle_unlocked(struct llentry *lle) +in6_lltable_destroy_lle_unlocked(epoch_context_t ctx) { + struct llentry *lle; + lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); - in6_lltable_destroy_lle_unlocked(lle); + epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in6_lltable_destroy_lle_unlocked); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; int error; char ip6buf[INET6_ADDRSTRLEN]; int fibnum; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib; error = fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6); if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ + NET_EPOCH_ENTER(); ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { - ifa_free(ifa); + NET_EPOCH_EXIT(); return 0; } + NET_EPOCH_EXIT(); log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } /* * Called by the datapath to indicate that the entry was used. */ static void in6_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; /* * Set the hit time so the callback function * can determine the remaining time before * transiting to the DELAY state. */ lle->lle_hittime = time_uptime; LLE_REQ_UNLOCK(lle); } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; - LIST_FOREACH(lle, lleh, lle_next) { + CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { - in6_lltable_destroy_lle_unlocked(lle); + epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in6_lltable_destroy_lle_unlocked); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; llt->llt_mark_used = in6_lltable_mark_used; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ifp, ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: head/sys/netinet6/in6_ifattach.c =================================================================== --- head/sys/netinet6/in6_ifattach.c (revision 334117) +++ head/sys/netinet6/in6_ifattach.c (revision 334118) @@ -1,904 +1,904 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } IF_ADDR_RUNLOCK(ifp); return -1; found: IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_ATM: case IFT_IEEE1394: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { IF_ADDR_RUNLOCK(ifp); return -1; } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } if (bcmp(addr, allone, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ IF_ADDR_RUNLOCK(ifp); return -1; default: IF_ADDR_RUNLOCK(ifp); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { IF_ADDR_RUNLOCK(ifp); return -1; } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } IF_ADDR_RUNLOCK(ifp); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK_NOSLEEP(); goto success; } } IFNET_RUNLOCK_NOSLEEP(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; struct nd_prefix *pr; int error; /* * configure link-local address. */ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ KASSERT(ia != NULL, ("%s: ia == NULL, ifp=%p", __func__, ifp)); ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } else nd6_prefix_rele(pr); return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; break; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* * check that loopback address doesn't exist yet. */ ia = in6ifa_ifwithaddr(&in6addr_loopback, 0); if (ia == NULL) in6_ifattach_loopback(ifp); else ifa_free(&ia->ia_ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) in6_ifattach_linklocal(ifp, altifp); else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * * When shutting down a VNET we clean up layers top-down. In that case * upper layer protocols (ulp) are cleaned up already and locks are destroyed * and we must not call into these cleanup functions anymore, thus purgeulp * is set to 0 in that case by in6_ifdetach_destroy(). * The normal case of destroying a (cloned) interface still needs to cleanup * everything related to the interface and will have purgeulp set to 1. */ static void _in6_ifdetach(struct ifnet *ifp, int purgeulp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * nuke any of IPv6 addresses we have * XXX: all addresses should be already removed */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } if (purgeulp) { in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); } /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * Remove neighbor management table. * Enabling the nd6_purge will panic on vmove for interfaces on VNET * teardown as the IPv6 layer is cleaned up already and the locks * are destroyed. */ if (purgeulp) nd6_purge(ifp); } void in6_ifdetach(struct ifnet *ifp) { _in6_ifdetach(ifp, 1); } void in6_ifdetach_destroy(struct ifnet *ifp) { _in6_ifdetach(ifp, 0); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. */ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { struct in6_multi_head purgeinms; struct in6_multi *inm; struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. */ restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; in6m_rele_locked(&purgeinms, inm); if (__predict_false(ifma6_restart)) { ifma6_restart = false; goto restart; } } IF_ADDR_WUNLOCK(ifp); mld_ifdetach(ifp); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); in6m_release_list_deferred(&purgeinms); } void in6_ifattach_destroy(void) { callout_drain(&V_in6_tmpaddrtimer_ch); } static void in6_ifattach_init(void *dummy) { /* Timer for regeneranation of temporary addresses randomize ID. */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); } /* * Cheat. * This must be after route_init(), which is now SI_ORDER_THIRD. */ SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, in6_ifattach_init, NULL); Index: head/sys/netinet6/in6_pcb.c =================================================================== --- head/sys/netinet6/in6_pcb.c (revision 334117) +++ head/sys/netinet6/in6_pcb.c (revision 334118) @@ -1,1314 +1,1315 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *); int in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int error, lookupflags = 0; int reuseport = (so->so_options & SO_REUSEPORT); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); if (CK_STAILQ_EMPTY(&V_in6_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); } else { sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof(*sin6)) return (EINVAL); /* * family check. */ if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if ((error = prison_local_ip6(cred, &sin6->sin6_addr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); lport = sin6->sin6_port; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow compepte duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ + NET_EPOCH_ENTER(); if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == NULL && (inp->inp_flags & INP_BINDANY) == 0) { + NET_EPOCH_EXIT(); return (EADDRNOTAVAIL); } /* * XXX: bind to an anycast address might accidentally * cause sending a packet with anycast source address. * We should allow to bind to a deprecated address, since * the application dares to use it. */ if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { - ifa_free(ifa); + NET_EPOCH_EXIT(); return (EADDRNOTAVAIL); } - if (ifa != NULL) - ifa_free(ifa); + NET_EPOCH_EXIT(); } if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) && (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } #endif } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0) { return (EADDRINUSE); } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, lookupflags, cred); if (t && t->inp_flags & INP_TIMEWAIT) { tw = intotw(t); if (tw == NULL) return (EADDRINUSE); if ((reuseport & tw->tw_so_options) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || ((inp->inp_vflag & INP_IPV6PROTO) == (t->inp_vflag & INP_IPV6PROTO)))) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_vflag & INP_IPV6PROTO) != 0)) return (EADDRINUSE); } #endif } inp->in6p_laddr = sin6->sin6_addr; } if (lport == 0) { if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; return (error); } } else { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } } return (0); } /* * Transform old in6_pcbconnect() into an inner subroutine for new * in6_pcbconnect(): Do some validity-checking on the remote * address (in mbuf 'nam') and then determine local host address * (i.e., which interface) to use to access that remote host. * * This preserves definition of in6_pcbconnect(), while supporting a * slightly different version for T/TCP. (This is more than * a bit of a kludge, but cleaning up the internal interfaces would * have forced minor changes in every protocol). */ static int in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in6_addr *plocal_addr6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error = 0; int scope_ambiguous = 0; struct in6_addr in6a; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); /* XXXRW: why? */ if (nam->sa_len != sizeof (*sin6)) return (EINVAL); if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if (!CK_STAILQ_EMPTY(&V_in6_ifaddrhead)) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0) return (error); error = in6_selectsrc_socket(sin6, inp->in6p_outputopts, inp, inp->inp_cred, scope_ambiguous, &in6a, NULL); if (error) return (error); /* * Do not update this earlier, in case we return with an error. * * XXX: this in6_selectsrc_socket result might replace the bound local * address with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ *plocal_addr6 = in6a; /* * Don't do pcblookup call here; return interface in * plocal_addr6 * and exit to caller, that will do the lookup. */ return (0); } /* * Outer subroutine: * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in6_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr addr6; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return (error); if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, cred); if (error) return (error); } inp->in6p_laddr = addr6; } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; if (inp->inp_flags & IN6P_AUTOFLOWLABEL) inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash_mbuf(inp, m); return (0); } int in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in6_pcbconnect_mbuf(inp, nam, cred, NULL)); } void in6_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; in_pcbrehash(inp); } struct sockaddr * in6_sockaddr(in_port_t port, struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; sin6 = malloc(sizeof *sin6, M_SONAME, M_WAITOK); bzero(sin6, sizeof *sin6); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = port; sin6->sin6_addr = *addr_p; (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ return (struct sockaddr *)sin6; } struct sockaddr * in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = port; sin.sin_addr = *addr_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); in6_sin_2_v4mapsin6(&sin, sin6_p); return (struct sockaddr *)sin6_p; } int in6_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->in6p_laddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->in6p_faddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getsockaddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif { /* scope issues will be handled in in6_getsockaddr(). */ error = in6_getsockaddr(so, nam); } return error; } int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getpeeraddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif /* scope issues will be handled in in6_getpeeraddr(). */ error = in6_getpeeraddr(so, nam); return error; } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. */ void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; int errno; if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6) return; sa6_dst = (struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* * Redirects go to all references to the destination, * and use in6_rtchange to invalidate the route cache. * Dead host indications: also use in6_rtchange to invalidate * the cache, and deliver the error to all the sockets. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. */ if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { fport = 0; lport = 0; bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr)); if (cmd != PRC_HOSTDEAD) notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); if ((inp->inp_vflag & INP_IPV6) == 0) { INP_WUNLOCK(inp); continue; } /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. * XXX: should we avoid to notify the value to TCP sockets? */ if (cmd == PRC_MSGSIZE && cmdarg != NULL) ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, *(u_int32_t *)cmdarg); /* * Detect if we should notify the error. If no source and * destination ports are specifed, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. Otherwise, just compare addresses and ports * as usual. */ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr) || inp->inp_socket == 0 || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { INP_WUNLOCK(inp); continue; } do_notify: if (notify) { if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; int matchwild = 3, wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_WLOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_lport == lport) { /* Found. */ if (cred == NULL || prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) continue; /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) wildcard++; if (!IN6_IS_ADDR_UNSPECIFIED( &inp->in6p_laddr)) { if (IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; else if (!IN6_ARE_ADDR_EQUAL( &inp->in6p_laddr, laddr)) continue; } else { if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *in6p; struct ip6_moptions *im6o; int i, gap; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(in6p); im6o = in6p->in6p_moptions; if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { /* * Unselect the outgoing ifp for multicast if it * is being detached. */ if (im6o->im6o_multicast_ifp == ifp) im6o->im6o_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ gap = 0; for (i = 0; i < im6o->im6o_num_memberships; i++) { if (im6o->im6o_membership[i]->in6m_ifp == ifp) { in6_leavegroup(im6o->im6o_membership[i], NULL); gap++; } else if (gap != 0) { im6o->im6o_membership[i - gap] = im6o->im6o_membership[i]; } } im6o->im6o_num_memberships -= gap; } INP_WUNLOCK(in6p); } INP_INFO_WUNLOCK(pcbinfo); } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in6_losing(struct inpcb *in6p) { if (in6p->inp_route6.ro_rt) { RTFREE(in6p->inp_route6.ro_rt); in6p->inp_route6.ro_rt = (struct rtentry *)NULL; } if (in6p->inp_route.ro_lle) LLE_FREE(in6p->inp_route.ro_lle); /* zeros ro_lle */ return; } /* * After a routing change, flush old routing * and allocate a (hopefully) better one. */ struct inpcb * in6_rtchange(struct inpcb *inp, int errno) { if (inp->inp_route6.ro_rt) { RTFREE(inp->inp_route6.ro_rt); inp->inp_route6.ro_rt = (struct rtentry *)NULL; } if (inp->inp_route.ro_lle) LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */ return inp; } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } /* * Then look for a wildcard match in the pcbgroup. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[ INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking buf", __func__); if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list. */ static struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) return (inp); else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ /* * Not found. */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; bool locked; INP_HASH_RLOCK(pcbinfo); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (!locked) in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in6_pcbgroup.c. */ struct inpcb * in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && M_HASHTYPE_TEST(m, M_HASHTYPE_NONE) == 0) { pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int srcordst) { struct ip6_hdr *ip; ip = mtod(m, struct ip6_hdr *); bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_addr = srcordst ? ip->ip6_dst : ip->ip6_src; (void)sa6_recoverscope(sin6); /* XXX: should catch errors... */ return; } Index: head/sys/netinet6/ip6_input.c =================================================================== --- head/sys/netinet6/ip6_input.c (revision 334117) +++ head/sys/netinet6/ip6_input.c (revision 334118) @@ -1,1862 +1,1862 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; static int sysctl_netinet6_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_nh, qlimit)); } SYSCTL_DECL(_net_inet6_ip6); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_queue_maxlen, "I", "Maximum size of the IPv6 input queue"); #ifdef RSS static struct netisr_handler ip6_direct_nh = { .nh_name = "ip6_direct", .nh_handler = ip6_direct_input, .nh_proto = NETISR_IPV6_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; static int sysctl_netinet6_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_direct_queue_maxlen, "I", "Maximum size of the IPv6 direct input queue"); #endif VNET_DEFINE(struct pfil_head, inet6_pfil_hook); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init(void) { struct protosw *pr; int i; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); CK_STAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. */ V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6, &V_ipsec_hhh_out[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip6_nh); #ifdef RSS netisr_register_vnet(&ip6_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip6_init"); /* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } netisr_register(&ip6_nh); #ifdef RSS netisr_register(&ip6_direct_nh); #endif } /* * The protocol to be inserted into ip6_protox[] must be already registered * in inet6sw[], either statically or through pf_proto_register(). */ int ip6proto_register(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] != pr - inet6sw) /* IPPROTO_RAW */ return (EEXIST); /* * Find the protocol position in inet6sw[] and set the index. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol == ip6proto) { ip6_protox[pr->pr_protocol] = pr - inet6sw; return (0); } } return (EPROTONOSUPPORT); } int ip6proto_unregister(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] == pr - inet6sw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip6_protox[ip6proto] = pr - inet6sw; return (0); } #ifdef VIMAGE static void ip6_destroy(void *unused __unused) { struct ifaddr *ifa, *nifa; struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip6_direct_nh); #endif netisr_unregister_vnet(&ip6_nh); if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } /* Cleanup addresses. */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_LOCK(ifp); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } /* IF_ADDR_UNLOCK(ifp); */ in6_ifdetach_destroy(ifp); mld_domifdetach(ifp); /* Make sure any routes are gone as well. */ rt_flushifroutes_af(ifp, AF_INET6); } IFNET_RUNLOCK(); nd6_destroy(); in6_ifattach_destroy(); hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); } VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; if (ip6_hopopts_input(plen, rtalert, &m, off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); goto out; } #endif *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. * Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } return (0); out: return (1); } #ifdef RSS /* * IPv6 direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip6_direct_input(struct mbuf *m) { int off, nxt; int nest; struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL); KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!")); ip6dc = (struct ip6_direct_ctx *)(mtag + 1); nxt = ip6dc->ip6dc_nxt; off = ip6dc->ip6dc_off; nest = 0; m_tag_delete(m, mtag); while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } #endif void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; struct ifnet *rcvif; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; /* * Drop the packet if IPv6 operation is disabled on the interface. */ rcvif = m->m_pkthdr.rcvif; if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ ip6 = mtod(m, struct ip6_hdr *); goto passin; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { if (m->m_flags & M_LOOP) { IP6STAT_INC(ip6s_m2m[V_loif->if_index]); } else if (rcvif->if_index < IP6S_M2MMAX) IP6STAT_INC(ip6s_m2m[rcvif->if_index]); else IP6STAT_INC(ip6s_m2m[0]); } else IP6STAT_INC(ip6s_m1); } in6_ifstat_inc(rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) goto bad; m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, rcvif, NULL, ip6); /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* * Try to forward the packet, but if we fail continue. * ip6_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip6 pointer. */ if (V_ip6_forwarding != 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv6) || IPSEC_CAPS(ipv6, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip6_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { ip6 = mtod(m, struct ip6_hdr *); goto passin; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv6) && IPSEC_CAPS(ipv6, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passin; odst = ip6->ip6_dst; if (pfil_run_hooks(&V_inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, 0, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if ((m->m_flags & (M_IP6_NEXTHOP | M_FASTFWD_OURS)) == M_IP6_NEXTHOP && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, rcvif, NULL) || in6_setscope(&ip6->ip6_dst, rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); ifa_free(&ia->ia_ifa); goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ifa_free(&ia->ia_ifa); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). */ if (ip6_mforward && ip6_mforward(ip6, rcvif, m)) { IP6STAT_INC(ip6s_cantforward); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: in6_ifstat_inc(rcvif, ifs6_in_discard); if (m != NULL) m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in contiguous memory region. */ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not contiguous in order to return an ICMPv6 error. */ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. */ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { union { struct timeval tv; struct bintime bt; struct timespec ts; } t; struct bintime boottimebin, bt1; struct timespec ts1; bool stamped; stamped = false; switch (inp->inp_socket->so_ts_clock) { case SO_TS_REALTIME_MICRO: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &t.tv); } else { microtime(&t.tv); } *mp = sbcreatecontrol((caddr_t) &t.tv, sizeof(t.tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_BINTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &t.bt); getboottimebin(&boottimebin); bintime_add(&t.bt, &boottimebin); } else { bintime(&t.bt); } *mp = sbcreatecontrol((caddr_t)&t.bt, sizeof(t.bt), SCM_BINTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_REALTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &t.ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); timespecadd(&t.ts, &ts1); } else { nanotime(&t.ts); } *mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts), SCM_REALTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_MONOTONIC: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &t.ts); else nanouptime(&t.ts); *mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts), SCM_MONOTONIC, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; default: panic("unknown (corrupted) so_ts_clock"); } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET); if (*mp != NULL) mp = &(*mp)->m_next; } } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); if (v4only) return; /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if (!(in6p->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(in6p->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } if (in6p->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } } #endif } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. * NOTE: we notify disconnected sockets, because some udp * applications keep sending sockets disconnected. * NOTE: our implementation doesn't notify connected sockets that has * foreign address that is different than given destination addresses * (this is permitted by RFC 3542). */ if ((inp->inp_flags & IN6P_MTU) == 0 || ( !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))) return; mtuctl.ip6m_mtu = mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; so = inp->inp_socket; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); } #ifdef PULLDOWN_TEST /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; if (elen > MLEN) n = m_getcl(M_NOWAIT, MT_DATA, 0); else n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) return NULL; m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get pointer to the previous header followed by the header * currently processed. */ int ip6_get_prevhdr(const struct mbuf *m, int off) { struct ip6_ext ip6e; struct ip6_hdr *ip6; int len, nlen, nxt; if (off == sizeof(struct ip6_hdr)) return (offsetof(struct ip6_hdr, ip6_nxt)); if (off < sizeof(struct ip6_hdr)) panic("%s: off < sizeof(struct ip6_hdr)", __func__); ip6 = mtod(m, struct ip6_hdr *); nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); nlen = 0; while (len < off) { m_copydata(m, len, sizeof(ip6e), (caddr_t)&ip6e); switch (nxt) { case IPPROTO_FRAGMENT: nlen = sizeof(struct ip6_frag); break; case IPPROTO_AH: nlen = (ip6e.ip6e_len + 2) << 2; break; default: nlen = (ip6e.ip6e_len + 1) << 3; } len += nlen; nxt = ip6e.ip6e_nxt; } return (len - nlen); } /* * get next header offset. m will be retained. */ int ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } /* NOTREACHED */ } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; Index: head/sys/netinet6/nd6.c =================================================================== --- head/sys/netinet6/nd6.c (revision 334117) +++ head/sys/netinet6/nd6.c (revision 334118) @@ -1,2740 +1,2740 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif static eventhandler_tag lle_event_eh, iflladdr_event_eh; VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(struct rwlock, nd6_lock); VNET_DEFINE(uint64_t, nd6_list_genid); VNET_DEFINE(struct mtx, nd6_onlink_mtx); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static void nd6_free(struct llentry **, int); static void nd6_free_redirect(const struct llentry *); static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **); static int nd6_need_cache(struct ifnet *); static VNET_DEFINE(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE(struct callout, nd6_timer_ch); #define V_nd6_timer_ch VNET(nd6_timer_ch) static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct rt_addrinfo rtinfo; struct sockaddr_in6 dst; struct sockaddr_dl gw; struct ifnet *ifp; int type; int fibnum; LLE_WLOCK_ASSERT(lle); if (lltable_get_af(lle->lle_tbl) != AF_INET6) return; switch (evt) { case LLENTRY_RESOLVED: type = RTM_ADD; KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); break; case LLENTRY_EXPIRED: type = RTM_DELETE; break; default: return; } ifp = lltable_get_ifp(lle->lle_tbl); bzero(&dst, sizeof(dst)); bzero(&gw, sizeof(gw)); bzero(&rtinfo, sizeof(rtinfo)); lltable_fill_sa_entry(lle, (struct sockaddr *)&dst); dst.sin6_scope_id = in6_getscopezone(ifp, in6_addrscope(&dst.sin6_addr)); gw.sdl_len = sizeof(struct sockaddr_dl); gw.sdl_family = AF_LINK; gw.sdl_alen = ifp->if_addrlen; gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ifp->if_fib; rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | ( type == RTM_ADD ? RTF_UP: 0), 0, fibnum); } /* * A handler for interface link layer address change event. */ static void nd6_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE6(ifp)); } void nd6_init(void) { mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF); rw_init(&V_nd6_lock, "nd6 list"); LIST_INIT(&V_nd_prefix); TAILQ_INIT(&V_nd_defrouter); /* Start timers. */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); callout_init(&V_nd6_timer_ch, 0); callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); nd6_dad_init(); if (IS_DEFAULT_VNET(curvnet)) { lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event, nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) { EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh); } rw_destroy(&V_nd6_lock); mtx_destroy(&V_nd6_onlink_mtx); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) nd->flags |= ND6_IFF_ACCEPT_RTADV; if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd) { struct ifaddr *ifa, *next; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; /* stop DAD processing */ nd6_dad_stop(ifa); } IF_ADDR_RUNLOCK(ifp); free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return; nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; ndi->maxmtu = ifp->if_mtu; /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. */ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ static void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->lle_timer); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->lle_timer, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->lle_timer, tick, nd6_llinfo_timer, ln); } } if (canceled > 0) LLE_REMREF(ln); } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. * * Set noinline to be dtrace-friendly */ static __noinline struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr hdr; struct mbuf *m; if (ln->la_hold == NULL) return (NULL); /* * assume every packet in la_hold has the same IP header */ m = ln->la_hold; if (sizeof(hdr) > m->m_len) return (NULL); m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr); *src = hdr.ip6_src; return (src); } /* * Checks if we need to switch from STALE state. * * RFC 4861 requires switching from STALE to DELAY state * on first packet matching entry, waiting V_nd6_delay and * transition to PROBE state (if upper layer confirmation was * not received). * * This code performs a bit differently: * On packet hit we don't change state (but desired state * can be guessed by control plane). However, after V_nd6_delay * seconds code will transition to PROBE state (so DELAY state * is kinda skipped in most situations). * * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so * we perform the following upon entering STALE state: * * 1) Arm timer to run each V_nd6_delay seconds to make sure that * if packet was transmitted at the start of given interval, we * would be able to switch to PROBE state in V_nd6_delay seconds * as user expects. * * 2) Reschedule timer until original V_nd6_gctimer expires keeping * lle in STALE state (remaining timer value stored in lle_remtime). * * 3) Reschedule timer if packet was transmitted less that V_nd6_delay * seconds ago. * * Returns non-zero value if the entry is still STALE (storing * the next timer interval in @pdelay). * * Returns zero value if original timer expired or we need to switch to * PROBE (store that in @do_switch variable). */ static int nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch) { int nd_delay, nd_gctimer, r_skip_req; time_t lle_hittime; long delay; *do_switch = 0; nd_gctimer = V_nd6_gctimer; nd_delay = V_nd6_delay; LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; lle_hittime = lle->lle_hittime; LLE_REQ_UNLOCK(lle); if (r_skip_req > 0) { /* * Nonzero r_skip_req value was set upon entering * STALE state. Since value was not changed, no * packets were passed using this lle. Ask for * timer reschedule and keep STALE state. */ delay = (long)(MIN(nd_gctimer, nd_delay)); delay *= hz; if (lle->lle_remtime > delay) lle->lle_remtime -= delay; else { delay = lle->lle_remtime; lle->lle_remtime = 0; } if (delay == 0) { /* * The original ng6_gctime timeout ended, * no more rescheduling. */ return (0); } *pdelay = delay; return (1); } /* * Packet received. Verify timestamp */ delay = (long)(time_uptime - lle_hittime); if (delay < nd_delay) { /* * V_nd6_delay still not passed since the first * hit in STALE state. * Reshedule timer and return. */ *pdelay = (long)(nd_delay - delay) * hz; return (1); } /* Request switching to probe */ *do_switch = 1; return (0); } /* * Switch @lle state to new state optionally arming timers. * * Set noinline to be dtrace-friendly */ __noinline void nd6_llinfo_setstate(struct llentry *lle, int newstate) { struct ifnet *ifp; int nd_gctimer, nd_delay; long delay, remtime; delay = 0; remtime = 0; switch (newstate) { case ND6_LLINFO_INCOMPLETE: ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(lle)) { ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->reachable * hz; } break; case ND6_LLINFO_STALE: /* * Notify fast path that we want to know if any packet * is transmitted by setting r_skip_req. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); nd_delay = V_nd6_delay; nd_gctimer = V_nd6_gctimer; delay = (long)(MIN(nd_gctimer, nd_delay)) * hz; remtime = (long)nd_gctimer * hz - delay; break; case ND6_LLINFO_DELAY: lle->la_asked = 0; delay = (long)V_nd6_delay * hz; break; } if (delay > 0) nd6_llinfo_settimer_locked(lle, delay); lle->lle_remtime = remtime; lle->ln_state = newstate; } /* * Timer-dependent part of nd state machine. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_llinfo_timer(void *arg) { struct llentry *ln; struct in6_addr *dst, *pdst, *psrc, src; struct ifnet *ifp; struct nd_ifinfo *ndi; int do_switch, send_ns; long delay; KASSERT(arg != NULL, ("%s: arg NULL", __func__)); ln = (struct llentry *)arg; ifp = lltable_get_ifp(ln->lle_tbl); CURVNET_SET(ifp->if_vnet); ND6_RLOCK(); LLE_WLOCK(ln); if (callout_pending(&ln->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by nd6_llinfo_settimer_locked above since canceled * would have been 1. */ LLE_WUNLOCK(ln); ND6_RUNLOCK(); CURVNET_RESTORE(); return; } ndi = ND_IFINFO(ifp); send_ns = 0; dst = &ln->r_l3addr.addr6; pdst = dst; if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer_locked(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer_locked(ln, ln->ln_ntick); } goto done; } if (ln->la_flags & LLE_STATIC) { goto done; } if (ln->la_flags & LLE_DELETED) { nd6_free(&ln, 0); goto done; } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->la_asked < V_nd6_mmaxtries) { ln->la_asked++; send_ns = 1; /* Send NS to multicast address */ pdst = NULL; } else { struct mbuf *m = ln->la_hold; if (m) { struct mbuf *m0; /* * assuming every packet in la_hold has the * same IP header. Send error after unlock. */ m0 = m->m_nextpkt; m->m_nextpkt = NULL; ln->la_hold = m0; clear_llinfo_pqueue(ln); } nd6_free(&ln, 0); if (m != NULL) icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp); } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); break; case ND6_LLINFO_STALE: if (nd6_is_stale(ln, &delay, &do_switch) != 0) { /* * No packet has used this entry and GC timeout * has not been passed. Reshedule timer and * return. */ nd6_llinfo_settimer_locked(ln, delay); break; } if (do_switch == 0) { /* * GC timer has ended and entry hasn't been used. * Run Garbage collector (RFC 4861, 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) nd6_free(&ln, 1); break; } /* Entry has been used AND delay timer has ended. */ /* FALLTHROUGH */ case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE); send_ns = 1; } else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */ break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; send_ns = 1; } else { nd6_free(&ln, 0); } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (ln != NULL) ND6_RUNLOCK(); if (send_ns != 0) { nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); psrc = nd6_llinfo_get_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd6_ns_output(ifp, psrc, pdst, dst, NULL); } if (ln != NULL) LLE_FREE_LOCKED(ln); CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_drhead drq; struct nd_prhead prl; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; struct in6_ifaddr *ia6, *nia6; uint64_t genid; TAILQ_INIT(&drq); LIST_INIT(&prl); ND6_WLOCK(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) if (dr->expire && dr->expire < time_uptime) defrouter_unlink(dr, &drq); ND6_WUNLOCK(); while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } /* * expire interface addresses. * in the past the loop was inside prefix expiry processing. * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ addrloop: CK_STAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) { /* * Schedule DAD for a tentative address. This happens * if the interface was down or not running * when the address was configured. */ int delay; delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); nd6_dad_start((struct ifaddr *)ia6, delay); } else { /* * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 || (ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ia6->ia_ifp)->flags & ND6_IFF_IFDISABLED) != 0) { ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } ND6_WLOCK(); restart: LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* * Expire prefixes. Since the pltime is only used for * autoconfigured addresses, pltime processing for prefixes is * not necessary. * * Only unlink after all derived addresses have expired. This * may not occur until two hours after the prefix has expired * per RFC 4862. If the prefix expires before its derived * addresses, mark it off-link. This will be done automatically * after unlinking if no address references remain. */ if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME || time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime) continue; if (pr->ndpr_addrcnt == 0) { nd6_prefix_unlink(pr, &prl); continue; } if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { genid = V_nd6_list_genid; nd6_prefix_ref(pr); ND6_WUNLOCK(); ND6_ONLINK_LOCK(); (void)nd6_prefix_offlink(pr); ND6_ONLINK_UNLOCK(); ND6_WLOCK(); nd6_prefix_rele(pr); if (genid != V_nd6_list_genid) goto restart; } } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; ifp = ia6->ia_ifa.ifa_ifp; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); IF_ADDR_RUNLOCK(ifp); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Remove prefix and default router list entries corresponding to ifp. Neighbor * cache entries are freed in in6_domifdetach(). */ void nd6_purge(struct ifnet *ifp) { struct nd_drhead drq; struct nd_prhead prl; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; TAILQ_INIT(&drq); LIST_INIT(&prl); /* * Nuke default router list entries toward ifp. * We defer removal of default router list entries that is installed * in the routing table, in order to keep additional side effects as * small as possible. */ ND6_WLOCK(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->installed) continue; if (dr->ifp == ifp) defrouter_unlink(dr, &drq); } TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (!dr->installed) continue; if (dr->ifp == ifp) defrouter_unlink(dr, &drq); } /* * Remove prefixes on ifp. We should have already removed addresses on * this interface, so no addresses should be referencing these prefixes. */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { if (pr->ndpr_ifp == ifp) nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); /* Delete the unlinked router and prefix objects. */ while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select_fib(ifp->if_fib); } } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6); return (ln); } struct llentry * nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6); if (ln != NULL) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *ifa; struct rt_addrinfo info; struct sockaddr_in6 rt_key; const struct sockaddr *dst6; uint64_t genid; int error, fibnum; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } bzero(&rt_key, sizeof(rt_key)); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ ND6_RLOCK(); restart: LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { dst6 = (const struct sockaddr *)&pr->ndpr_prefix; /* * We only need to check all FIBs if add_addr_allfibs * is unset. If set, checking any FIB will suffice. */ fibnum = V_rt_add_addr_allfibs ? rt_numfibs - 1 : 0; for (; fibnum < rt_numfibs; fibnum++) { genid = V_nd6_list_genid; ND6_RUNLOCK(); /* * Restore length field before * retrying lookup */ rt_key.sin6_len = sizeof(rt_key); error = rib_lookup_info(fibnum, dst6, 0, 0, &info); ND6_RLOCK(); if (genid != V_nd6_list_genid) goto restart; if (error == 0) break; } if (error != 0) continue; /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. In the case * where RADIX_MPATH is enabled, multiple route * entries (of the same rt_key value) will be * installed because the interface addresses all * differ. */ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &rt_key.sin6_addr)) continue; } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) { ND6_RUNLOCK(); return (1); } } ND6_RUNLOCK(); /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ if (ifp->if_flags & IFF_POINTOPOINT) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sin6_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { IF_ADDR_RUNLOCK(ifp); return 1; } } IF_ADDR_RUNLOCK(ifp); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && TAILQ_EMPTY(&V_nd_defrouter) && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *lle; int rc = 0; IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ IF_AFDATA_RLOCK(ifp); if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } IF_AFDATA_RUNLOCK(ifp); return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_free(struct llentry **lnp, int gc) { struct ifnet *ifp; struct llentry *ln; struct nd_defrouter *dr; ln = *lnp; *lnp = NULL; LLE_WLOCK_ASSERT(ln); ND6_RLOCK_ASSERT(); ifp = lltable_get_ifp(ln->lle_tbl); if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0) dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp); else dr = NULL; ND6_RUNLOCK(); if ((ln->la_flags & LLE_DELETED) == 0) EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); LLE_REMREF(ln); LLE_WUNLOCK(ln); defrouter_rele(dr); return; } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select_fib() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&ln->r_l3addr.addr6, ifp); } if (dr) { /* * Since defrouter_select_fib() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select_fib(dr->ifp->if_fib); } /* * If this entry was added by an on-link redirect, remove the * corresponding host route. */ if (ln->la_flags & LLE_REDIRECT) nd6_free_redirect(ln); if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { /* Remove callout reference */ LLE_REMREF(ln); lltable_unlink_entry(ln->lle_tbl, ln); } IF_AFDATA_UNLOCK(ifp); llentry_free(ln); if (dr != NULL) defrouter_rele(dr); } static int nd6_isdynrte(const struct rtentry *rt, void *xap) { if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC)) return (1); return (0); } /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. */ static void nd6_free_redirect(const struct llentry *ln) { int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6; info.rti_filter = nd6_isdynrte; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); } /* * Rejuvenate this function for routing operations related * processing. */ void nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; struct ifnet *ifp; gateway = (struct sockaddr_in6 *)rt->rt_gateway; ifp = rt->rt_ifp; switch (req) { case RTM_ADD: break; case RTM_DELETE: if (!ifp) return; /* * Only indirect routes are interesting. */ if ((rt->rt_flags & RTF_GATEWAY) == 0) return; /* * check for default route */ if (IN6_ARE_ADDR_EQUAL(&in6addr_any, &SIN6(rt_key(rt))->sin6_addr)) { dr = defrouter_lookup(&gateway->sin6_addr, ifp); if (dr != NULL) { dr->installed = 0; defrouter_rele(dr); } } break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * used to change host variables from userland. * intended for a use on router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has an link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa != NULL) { /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transision */ /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } IF_ADDR_RUNLOCK(ifp); } } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transision */ /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa != NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } } ND_IFINFO(ifp)->flags = ND.flags; break; #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select(); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct in6_ifaddr *ia, *ia_next; struct nd_prefix *pr, *next; struct nd_prhead prl; LIST_INIT(&prl); ND6_WLOCK(); LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); /* XXXRW: in6_ifaddrhead locking. */ CK_STAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } nd6_prefix_del(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ struct nd_drhead drq; struct nd_defrouter *dr; TAILQ_INIT(&drq); defrouter_reset(); ND6_WLOCK(); while ((dr = TAILQ_FIRST(&V_nd_defrouter)) != NULL) defrouter_unlink(dr, &drq); ND6_WUNLOCK(); while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } defrouter_select(); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&nb_addr, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + ln->lle_remtime / hz + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Calculates new isRouter value based on provided parameters and * returns it. */ static int nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr, int ln_router) { /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * is_new old_addr new_addr NS RS RA redir * D R * 0 n n (1) c ? s * 0 y n (2) c s s * 0 n y (3) c s s * 0 y y (4) c s s * 0 y y (5) c s s * 1 -- n (6) c c c s * 1 -- y (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_new) /* (6-7) */ ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln_router = 1; else { if (is_new) /* (6-7) */ ln_router = 0; } break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_new && (old_addr || new_addr)) || /* (2-5) */ (is_new && new_addr)) { /* (7) */ ln_router = 1; } break; } return (ln_router); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information * */ void nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL, *ln_tmp; int is_newentry; int do_update; int olladdr; int llchange; int flags; uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(from, flags, ifp); IF_AFDATA_RUNLOCK(ifp); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; ln = nd6_alloc(from, 0, ifp); if (ln == NULL) return; /* * Since we already know all the data for the new entry, * fill it before insertion. */ if (lladdr != NULL) { linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over newly-created one */ ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp); if (ln_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; if (lladdr != NULL) { /* (7) */ nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } } else { lltable_free_entry(LLTABLE6(ifp), ln); ln = ln_tmp; ln_tmp = NULL; } } /* do nothing if static ndp is set */ if ((ln->la_flags & LLE_STATIC)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); return; } olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y y (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ do_update = 0; if (is_newentry == 0 && llchange != 0) { do_update = 1; /* (3,5) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) { /* Entry was deleted */ return; } nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); } /* Calculates new router status */ router = nd6_is_router(type, code, is_newentry, olladdr, lladdr != NULL ? 1 : 0, ln->ln_router); ln->ln_router = router; /* Mark non-router redirects with special flag */ if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER) ln->la_flags |= LLE_REDIRECT; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, chain, &sin6); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select_fib() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select_fib() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if ((do_update || is_newentry) && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select_fib(ifp->if_fib); } } static void nd6_slowtimo(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } IFNET_RUNLOCK_NOSLEEP(); CURVNET_RESTORE(); } void nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain, struct sockaddr_in6 *sin6) { LLE_WLOCK_ASSERT(ln); *chain = ln->la_hold; ln->la_hold = NULL; lltable_fill_sa_entry(ln, (struct sockaddr *)sin6); if (ln->ln_state == ND6_LLINFO_STALE) { /* * The first time we send a packet to a * neighbor whose entry is STALE, we have * to change the state to DELAY and a sets * a timer to expire in DELAY_FIRST_PROBE_TIME * seconds to ensure do neighbor unreachability * detection on expiration. * (RFC 2461 7.3.3) */ nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY); } } int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route *ro) { int error; int ip6len; struct ip6_hdr *ip6; struct m_tag *mtag; #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro); return (error); } /* * Lookup link headerfor @sa_dst address. Stores found * data in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * If destination LLE does not exists or lle state modification * is required, call "slow" version. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; if (pflags != NULL) *pflags = 0; dst6 = (const struct sockaddr_in6 *)sa_dst; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (m != NULL && m->m_flags & M_MCAST) { switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr, desten); return (0); default: m_freem(m); return (EAFNOSUPPORT); } } IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, ifp); if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(ln->r_linkdata, desten, ln->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR); /* Check if we have feedback request from nd6 timer */ if (ln->r_skip_req != 0) { LLE_REQ_LOCK(ln); ln->r_skip_req = 0; /* Notify that entry was used */ ln->lle_hittime = time_uptime; LLE_REQ_UNLOCK(ln); } if (plle) { LLE_ADDREF(ln); *plle = ln; LLE_WUNLOCK(ln); } IF_AFDATA_RUNLOCK(ifp); return (0); } else if (plle && ln) LLE_WUNLOCK(ln); IF_AFDATA_RUNLOCK(ifp); return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle)); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Heavy version. * Function assume that destination LLE does not exist, * is invalid or stale, so LLE_EXCLUSIVE lock needs to be acquired. * * Set noinline to be dtrace-friendly */ static __noinline int nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; int send_ns, ll_len; char *lladdr; /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ if (lle == NULL) { IF_AFDATA_RLOCK(ifp); lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); IF_AFDATA_RUNLOCK(ifp); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ lle = nd6_alloc(&dst->sin6_addr, 0, ifp); if (lle == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Prefer any existing entry over newly-created one */ lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if (lle_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { lltable_free_entry(LLTABLE6(ifp), lle); lle = lle_tmp; lle_tmp = NULL; } } } if (lle == NULL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { m_freem(m); return (ENOBUFS); } if (m != NULL) m_freem(m); return (ENOBUFS); } LLE_WLOCK_ASSERT(lle); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY); /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { if (flags & LLE_ADDRONLY) { lladdr = lle->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = lle->r_linkdata; ll_len = lle->r_hdrlen; } bcopy(lladdr, desten, ll_len); if (pflags != NULL) *pflags = lle->la_flags; if (plle) { LLE_ADDREF(lle); *plle = lle; } LLE_WUNLOCK(lle); return (0); } /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * packet queue in the mbuf. When it exceeds nd6_maxqueuelen, * the oldest packet in the queue will be removed. */ if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. * Note that for newly-created lle la_asked will be 0, * so we will transition from ND6_LLINFO_NOSTATE to * ND6_LLINFO_INCOMPLETE state here. */ psrc = NULL; send_ns = 0; if (lle->la_asked == 0) { lle->la_asked++; send_ns = 1; psrc = nd6_llinfo_get_holdsrc(lle, &src); nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE); } LLE_WUNLOCK(lle); if (send_ns != 0) nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL); return (EWOULDBLOCK); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags) { int error; flags |= LLE_ADDRONLY; error = nd6_resolve_slow(ifp, flags, NULL, (const struct sockaddr_in6 *)dst, desten, pflags, NULL); return (error); } int nd6_flush_holdchain(struct ifnet *ifp, struct mbuf *chain, struct sockaddr_in6 *dst) { struct mbuf *m, *m_head; int error = 0; m_head = chain; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; error = nd6_output_ifp(ifp, ifp, m, dst, NULL); } /* * XXX * note that intermediate errors are blindly ignored */ return (error); } static int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than Ethernet and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add pernament ND6 link-layer record for given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in different place * 2) It is called by IPv6 protocol stack in contrast to * arp_ifinit() which is typically called in SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln, *ln_tmp; struct sockaddr *dst; ifp = ia->ia_ifa.ifa_ifp; if (nd6_need_cache(ifp) == 0) return (0); ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; dst = (struct sockaddr *)&ia->ia_addr; ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst); if (ln == NULL) return (ENOBUFS); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Unlink any entry if exists */ ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst); if (ln_tmp != NULL) lltable_unlink_entry(LLTABLE6(ifp), ln_tmp); lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); LLE_WUNLOCK(ln); if (ln_tmp != NULL) llentry_free(ln_tmp); return (0); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ void nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all) { struct sockaddr_in6 mask, addr; struct sockaddr *saddr, *smask; struct ifnet *ifp; ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); saddr = (struct sockaddr *)&addr; smask = (struct sockaddr *)&mask; if (all != 0) lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC); else lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; } static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_icmp6); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter", "NDP default router list"); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_prlist, "S,in6_prefix", "NDP prefix list"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { struct in6_defrouter d; struct nd_defrouter *dr; int error; if (req->newptr != NULL) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&d, sizeof(d)); d.rtaddr.sin6_family = AF_INET6; d.rtaddr.sin6_len = sizeof(d.rtaddr); ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { d.rtaddr.sin6_addr = dr->rtaddr; error = sa6_recoverscope(&d.rtaddr); if (error != 0) break; d.flags = dr->raflags; d.rtlifetime = dr->rtlifetime; d.expire = dr->expire + (time_second - time_uptime); d.if_index = dr->ifp->if_index; error = SYSCTL_OUT(req, &d, sizeof(d)); if (error != 0) break; } ND6_RUNLOCK(); return (error); } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); ND6_RLOCK(); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_addrcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) break; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) goto out; } } out: ND6_RUNLOCK(); return (error); } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 334117) +++ head/sys/netinet6/raw_ip6.c (revision 334118) @@ -1,897 +1,899 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) extern u_long rip_sendspace; extern u_long rip_recvspace; VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat); VNET_PCPUSTAT_SYSINIT(rip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rip6stat); #endif /* VIMAGE */ /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip6_mrouter); /* * The various mrouter functions. */ int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(u_long, caddr_t); /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *in6p; struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; RIP6STAT_INC(rip6s_ipackets); init_sin6(&fromsa, m, 0); /* general init */ ifp = m->m_pkthdr.rcvif; INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(in6p, &V_ripcb, inp_list) { /* XXX inp locking */ if ((in6p->inp_vflag & INP_IPV6) == 0) continue; if (in6p->inp_ip_p && in6p->inp_ip_p != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; if (jailed_without_vnet(in6p->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && prison_check_ip6(in6p->inp_cred, &ip6->ip6_dst) != 0) continue; } INP_RLOCK(in6p); if (in6p->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); if (in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { INP_RUNLOCK(in6p); RIP6STAT_INC(rip6s_badsum); continue; } } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (in6p->in6p_moptions && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If the incoming datagram is for MLD, allow it * through unconditionally to the raw socket. * * Use the M_RTALERT_MLD flag to check for MLD * traffic without having to inspect the mbuf chain * more deeply, as all MLDv1/v2 host messages MUST * contain the Router Alert option. * * In the case of MLDv1, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. im6o_mc_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if ((m->m_flags & M_RTALERT_MLD) == 0) { struct sockaddr_in6 mcaddr; bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(in6p->in6p_moptions, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); } if (blocked != MCAST_PASS) { IP6STAT_INC(ip6s_notmember); INP_RUNLOCK(in6p); continue; } } if (last != NULL) { struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6)) { if (n != NULL && IPSEC_CHECK_POLICY(ipv6, n, last) != 0) { m_freem(n); /* Do not inject data into pcb. */ n = NULL; } } #endif /* IPSEC */ if (n) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6) && last != NULL && IPSEC_CHECK_POLICY(ipv6, m, last) != 0) { m_freem(m); IP6STAT_DEC(ip6s_delivered); /* Do not inject data into pcb. */ INP_RUNLOCK(last); } else #endif /* IPSEC */ if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, m, &opts); /* Strip intermediate headers. */ m_adj(m, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); INP_RUNLOCK(last); } else { RIP6STAT_INC(rip6s_nosock); if (m->m_flags & M_MCAST) RIP6STAT_INC(rip6s_nosockmcast); if (proto == IPPROTO_NONE) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, ip6_get_prevhdr(m, *offp)); IP6STAT_DEC(ip6s_delivered); } return (IPPROTO_DONE); } void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* * If the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { cmdarg = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&V_ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } /* * Generate IPv6 header and pass packet to ip6_output. Tack on options user * may have setup with control call. */ int rip6_output(struct mbuf *m, struct socket *so, ...) { struct mbuf *control; struct m_tag *mtag; struct sockaddr_in6 *dstsock; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; int use_defzone = 0; int hlim = 0; struct in6_addr in6a; va_list ap; va_start(ap, so); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoinpcb(so); INP_WLOCK(in6p); if (control != NULL) { if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = in6p->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (!optp || !optp->ip6po_pktinfo || !optp->ip6po_pktinfo->ipi6_ifindex) use_defzone = V_ip6_use_defzone; if (dstsock->sin6_scope_id == 0 && !use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code to update * statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Source address selection. */ error = in6_selectsrc_socket(dstsock, optp, in6p, so->so_cred, scope_ambiguous, &in6a, &hlim); if (error) goto bad; error = prison_check_ip6(in6p->inp_cred, &in6a); if (error != 0) goto bad; ip6->ip6_src = in6a; ip6->ip6_dst = dstsock->sin6_addr; /* * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so not fill it here. */ ip6->ip6_nxt = in6p->inp_ip_p; ip6->ip6_hlim = hlim; if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* Compute checksum. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } /* * Send RA/RS messages to user land for protection, before sending * them to rtadvd/rtsol. */ if ((send_sendso_input_hook != NULL) && so->so_proto->pr_protocol == IPPROTO_ICMPV6) { switch (type) { case ND_ROUTER_ADVERT: case ND_ROUTER_SOLICIT: mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; m_tag_prepend(m, mtag); } } error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); ICMP6STAT_INC(icp6s_outhist[type]); } else RIP6STAT_INC(rip6s_opackets); goto freectl; bad: if (m) m_freem(m); freectl: if (control != NULL) { ip6_clearpktopts(&opt, -1); m_freem(control); } INP_WUNLOCK(in6p); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? */ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) { if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_SETFIB) { inp = sotoinpcb(so); INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set ? ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->inp_ip_p = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); INP_WUNLOCK(inp); return (0); } static void rip6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == V_ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. */ static void rip6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL")); if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ifa = NULL; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0) return (error); - if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) + if (CK_STAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return (EADDRNOTAVAIL); if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); + NET_EPOCH_ENTER(); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && - (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) + (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) { + NET_EPOCH_EXIT(); return (EADDRNOTAVAIL); + } if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { - ifa_free(ifa); + NET_EPOCH_EXIT(); return (EADDRNOTAVAIL); } - if (ifa != NULL) - ifa_free(ifa); + NET_EPOCH_EXIT(); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr in6a; int error = 0, scope_ambiguous = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); - if (TAILQ_EMPTY(&V_ifnet)) + if (CK_STAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin6_family != AF_INET6) return (EAFNOSUPPORT); /* * Application should provide a proper zone ID or the use of default * zone IDs should be enabled. Unfortunately, some applications do * not behave as it should, so we need a workaround. Even if an * appropriate ID is not determined, we'll see if we can determine * the outgoing interface. If we can, determine the zone ID based on * the interface below. */ if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); /* Source address selection. XXX: need pcblookup? */ error = in6_selectsrc_socket(addr, inp->in6p_outputopts, inp, so->so_cred, scope_ambiguous, &in6a, NULL); if (error) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = in6a; soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_send: inp == NULL")); /* Always copy sockaddr to avoid overwrites. */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); INP_RUNLOCK(inp); dst = &tmp; } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } if (nam->sa_len != sizeof(struct sockaddr_in6)) { m_freem(m); return (EINVAL); } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family == AF_UNSPEC) { /* * XXX: we allow this case for backward * compatibility to buggy applications that * rely on old (and wrong) kernel behavior. */ log(LOG_INFO, "rip6 SEND: address family is " "unspec. Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { m_freem(m); return(EAFNOSUPPORT); } } ret = rip6_output(m, so, dst, control); return (ret); } struct pr_usrreqs rip6_usrreqs = { .pru_abort = rip6_abort, .pru_attach = rip6_attach, .pru_bind = rip6_bind, .pru_connect = rip6_connect, .pru_control = in6_control, .pru_detach = rip6_detach, .pru_disconnect = rip6_disconnect, .pru_peeraddr = in6_getpeeraddr, .pru_send = rip6_send, .pru_shutdown = rip6_shutdown, .pru_sockaddr = in6_getsockaddr, .pru_close = rip6_close, }; Index: head/sys/netpfil/pf/pf_if.c =================================================================== --- head/sys/netpfil/pf/pf_if.c (revision 334117) +++ head/sys/netpfil/pf/pf_if.c (revision 334118) @@ -1,906 +1,906 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Daniel Hartmeier * Copyright (c) 2003 Cedric Berger * Copyright (c) 2005 Henning Brauer * Copyright (c) 2005 Ryan McBride * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $OpenBSD: pf_if.c,v 1.54 2008/06/14 16:55:28 mk Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(struct pfi_kif *, pfi_all); static VNET_DEFINE(long, pfi_update); #define V_pfi_update VNET(pfi_update) #define PFI_BUFFER_MAX 0x10000 VNET_DECLARE(int, pf_vnet_active); #define V_pf_vnet_active VNET(pf_vnet_active) static VNET_DEFINE(struct pfr_addr *, pfi_buffer); static VNET_DEFINE(int, pfi_buffer_cnt); static VNET_DEFINE(int, pfi_buffer_max); #define V_pfi_buffer VNET(pfi_buffer) #define V_pfi_buffer_cnt VNET(pfi_buffer_cnt) #define V_pfi_buffer_max VNET(pfi_buffer_max) eventhandler_tag pfi_attach_cookie; eventhandler_tag pfi_detach_cookie; eventhandler_tag pfi_attach_group_cookie; eventhandler_tag pfi_change_group_cookie; eventhandler_tag pfi_detach_group_cookie; eventhandler_tag pfi_ifaddr_event_cookie; static void pfi_attach_ifnet(struct ifnet *); static void pfi_attach_ifgroup(struct ifg_group *); static void pfi_kif_update(struct pfi_kif *); static void pfi_dynaddr_update(struct pfi_dynaddr *dyn); static void pfi_table_update(struct pfr_ktable *, struct pfi_kif *, int, int); static void pfi_instance_add(struct ifnet *, int, int); static void pfi_address_add(struct sockaddr *, int, int); static int pfi_if_compare(struct pfi_kif *, struct pfi_kif *); static int pfi_skip_if(const char *, struct pfi_kif *); static int pfi_unmask(void *); static void pfi_attach_ifnet_event(void * __unused, struct ifnet *); static void pfi_detach_ifnet_event(void * __unused, struct ifnet *); static void pfi_attach_group_event(void * __unused, struct ifg_group *); static void pfi_change_group_event(void * __unused, char *); static void pfi_detach_group_event(void * __unused, struct ifg_group *); static void pfi_ifaddr_event(void * __unused, struct ifnet *); RB_HEAD(pfi_ifhead, pfi_kif); static RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); static RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); static VNET_DEFINE(struct pfi_ifhead, pfi_ifs); #define V_pfi_ifs VNET(pfi_ifs) #define PFI_BUFFER_MAX 0x10000 MALLOC_DEFINE(PFI_MTYPE, "pf_ifnet", "pf(4) interface database"); LIST_HEAD(pfi_list, pfi_kif); static VNET_DEFINE(struct pfi_list, pfi_unlinked_kifs); #define V_pfi_unlinked_kifs VNET(pfi_unlinked_kifs) static struct mtx pfi_unlnkdkifs_mtx; MTX_SYSINIT(pfi_unlnkdkifs_mtx, &pfi_unlnkdkifs_mtx, "pf unlinked interfaces", MTX_DEF); void pfi_initialize_vnet(void) { struct ifg_group *ifg; struct ifnet *ifp; struct pfi_kif *kif; V_pfi_buffer_max = 64; V_pfi_buffer = malloc(V_pfi_buffer_max * sizeof(*V_pfi_buffer), PFI_MTYPE, M_WAITOK); kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); PF_RULES_WLOCK(); V_pfi_all = pfi_kif_attach(kif, IFG_ALL); PF_RULES_WUNLOCK(); IFNET_RLOCK(); - TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) + CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) pfi_attach_ifgroup(ifg); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) pfi_attach_ifnet(ifp); IFNET_RUNLOCK(); } void pfi_initialize(void) { pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event, pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event, pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event, pfi_attach_group_event, NULL, EVENTHANDLER_PRI_ANY); pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event, pfi_change_group_event, NULL, EVENTHANDLER_PRI_ANY); pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event, pfi_detach_group_event, NULL, EVENTHANDLER_PRI_ANY); pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event, pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); } void pfi_cleanup_vnet(void) { struct pfi_kif *kif; PF_RULES_WASSERT(); V_pfi_all = NULL; while ((kif = RB_MIN(pfi_ifhead, &V_pfi_ifs))) { RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif); if (kif->pfik_group) kif->pfik_group->ifg_pf_kif = NULL; if (kif->pfik_ifp) kif->pfik_ifp->if_pf_kif = NULL; free(kif, PFI_MTYPE); } mtx_lock(&pfi_unlnkdkifs_mtx); while ((kif = LIST_FIRST(&V_pfi_unlinked_kifs))) { LIST_REMOVE(kif, pfik_list); free(kif, PFI_MTYPE); } mtx_unlock(&pfi_unlnkdkifs_mtx); free(V_pfi_buffer, PFI_MTYPE); } void pfi_cleanup(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie); EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie); EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie); EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie); EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie); EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie); } struct pfi_kif * pfi_kif_find(const char *kif_name) { struct pfi_kif_cmp s; PF_RULES_ASSERT(); bzero(&s, sizeof(s)); strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name)); return (RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&s)); } struct pfi_kif * pfi_kif_attach(struct pfi_kif *kif, const char *kif_name) { struct pfi_kif *kif1; PF_RULES_WASSERT(); KASSERT(kif != NULL, ("%s: null kif", __func__)); kif1 = pfi_kif_find(kif_name); if (kif1 != NULL) { free(kif, PFI_MTYPE); return (kif1); } bzero(kif, sizeof(*kif)); strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name)); /* * It seems that the value of time_second is in unintialzied state * when pf sets interface statistics clear time in boot phase if pf * was statically linked to kernel. Instead of setting the bogus * time value have pfi_get_ifaces handle this case. In * pfi_get_ifaces it uses time_second if it sees the time is 0. */ kif->pfik_tzero = time_second > 1 ? time_second : 0; TAILQ_INIT(&kif->pfik_dynaddrs); RB_INSERT(pfi_ifhead, &V_pfi_ifs, kif); return (kif); } void pfi_kif_ref(struct pfi_kif *kif) { PF_RULES_WASSERT(); kif->pfik_rulerefs++; } void pfi_kif_unref(struct pfi_kif *kif) { PF_RULES_WASSERT(); KASSERT(kif->pfik_rulerefs > 0, ("%s: %p has zero refs", __func__, kif)); kif->pfik_rulerefs--; if (kif->pfik_rulerefs > 0) return; /* kif referencing an existing ifnet or group should exist. */ if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == V_pfi_all) return; RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif); kif->pfik_flags |= PFI_IFLAG_REFS; mtx_lock(&pfi_unlnkdkifs_mtx); LIST_INSERT_HEAD(&V_pfi_unlinked_kifs, kif, pfik_list); mtx_unlock(&pfi_unlnkdkifs_mtx); } void pfi_kif_purge(void) { struct pfi_kif *kif, *kif1; /* * Do naive mark-and-sweep garbage collecting of old kifs. * Reference flag is raised by pf_purge_expired_states(). */ mtx_lock(&pfi_unlnkdkifs_mtx); LIST_FOREACH_SAFE(kif, &V_pfi_unlinked_kifs, pfik_list, kif1) { if (!(kif->pfik_flags & PFI_IFLAG_REFS)) { LIST_REMOVE(kif, pfik_list); free(kif, PFI_MTYPE); } else kif->pfik_flags &= ~PFI_IFLAG_REFS; } mtx_unlock(&pfi_unlnkdkifs_mtx); } int pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif) { struct ifg_list *p; if (rule_kif == NULL || rule_kif == packet_kif) return (1); if (rule_kif->pfik_group != NULL) /* XXXGL: locking? */ - TAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next) + CK_STAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next) if (p->ifgl_group == rule_kif->pfik_group) return (1); return (0); } static void pfi_attach_ifnet(struct ifnet *ifp) { struct pfi_kif *kif; kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); PF_RULES_WLOCK(); V_pfi_update++; kif = pfi_kif_attach(kif, ifp->if_xname); kif->pfik_ifp = ifp; ifp->if_pf_kif = kif; pfi_kif_update(kif); PF_RULES_WUNLOCK(); } static void pfi_attach_ifgroup(struct ifg_group *ifg) { struct pfi_kif *kif; kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); PF_RULES_WLOCK(); V_pfi_update++; kif = pfi_kif_attach(kif, ifg->ifg_group); kif->pfik_group = ifg; ifg->ifg_pf_kif = kif; PF_RULES_WUNLOCK(); } int pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: switch (dyn->pfid_acnt4) { case 0: return (0); case 1: return (PF_MATCHA(0, &dyn->pfid_addr4, &dyn->pfid_mask4, a, AF_INET)); default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET)); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: switch (dyn->pfid_acnt6) { case 0: return (0); case 1: return (PF_MATCHA(0, &dyn->pfid_addr6, &dyn->pfid_mask6, a, AF_INET6)); default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6)); } break; #endif /* INET6 */ default: return (0); } } int pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af) { struct pfi_dynaddr *dyn; char tblname[PF_TABLE_NAME_SIZE]; struct pf_ruleset *ruleset = NULL; struct pfi_kif *kif; int rv = 0; PF_RULES_WASSERT(); KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u", __func__, aw->type)); KASSERT(aw->p.dyn == NULL, ("%s: dyn is %p", __func__, aw->p.dyn)); if ((dyn = malloc(sizeof(*dyn), PFI_MTYPE, M_NOWAIT | M_ZERO)) == NULL) return (ENOMEM); if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL) { free(dyn, PFI_MTYPE); return (ENOMEM); } if (!strcmp(aw->v.ifname, "self")) dyn->pfid_kif = pfi_kif_attach(kif, IFG_ALL); else dyn->pfid_kif = pfi_kif_attach(kif, aw->v.ifname); pfi_kif_ref(dyn->pfid_kif); dyn->pfid_net = pfi_unmask(&aw->v.a.mask); if (af == AF_INET && dyn->pfid_net == 32) dyn->pfid_net = 128; strlcpy(tblname, aw->v.ifname, sizeof(tblname)); if (aw->iflags & PFI_AFLAG_NETWORK) strlcat(tblname, ":network", sizeof(tblname)); if (aw->iflags & PFI_AFLAG_BROADCAST) strlcat(tblname, ":broadcast", sizeof(tblname)); if (aw->iflags & PFI_AFLAG_PEER) strlcat(tblname, ":peer", sizeof(tblname)); if (aw->iflags & PFI_AFLAG_NOALIAS) strlcat(tblname, ":0", sizeof(tblname)); if (dyn->pfid_net != 128) snprintf(tblname + strlen(tblname), sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net); if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) { rv = ENOMEM; goto _bad; } if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) { rv = ENOMEM; goto _bad; } dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE; dyn->pfid_iflags = aw->iflags; dyn->pfid_af = af; TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); aw->p.dyn = dyn; pfi_kif_update(dyn->pfid_kif); return (0); _bad: if (dyn->pfid_kt != NULL) pfr_detach_table(dyn->pfid_kt); if (ruleset != NULL) pf_remove_if_empty_ruleset(ruleset); if (dyn->pfid_kif != NULL) pfi_kif_unref(dyn->pfid_kif); free(dyn, PFI_MTYPE); return (rv); } static void pfi_kif_update(struct pfi_kif *kif) { struct ifg_list *ifgl; struct pfi_dynaddr *p; PF_RULES_WASSERT(); /* update all dynaddr */ TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry) pfi_dynaddr_update(p); /* again for all groups kif is member of */ if (kif->pfik_ifp != NULL) { IF_ADDR_RLOCK(kif->pfik_ifp); - TAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next) + CK_STAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next) pfi_kif_update((struct pfi_kif *) ifgl->ifgl_group->ifg_pf_kif); IF_ADDR_RUNLOCK(kif->pfik_ifp); } } static void pfi_dynaddr_update(struct pfi_dynaddr *dyn) { struct pfi_kif *kif; struct pfr_ktable *kt; PF_RULES_WASSERT(); KASSERT(dyn && dyn->pfid_kif && dyn->pfid_kt, ("%s: bad argument", __func__)); kif = dyn->pfid_kif; kt = dyn->pfid_kt; if (kt->pfrkt_larg != V_pfi_update) { /* this table needs to be brought up-to-date */ pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags); kt->pfrkt_larg = V_pfi_update; } pfr_dynaddr_update(kt, dyn); } static void pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags) { int e, size2 = 0; struct ifg_member *ifgm; V_pfi_buffer_cnt = 0; if (kif->pfik_ifp != NULL) pfi_instance_add(kif->pfik_ifp, net, flags); else if (kif->pfik_group != NULL) { IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next) + CK_STAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next) pfi_instance_add(ifgm->ifgm_ifp, net, flags); IFNET_RUNLOCK_NOSLEEP(); } if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2, NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK))) printf("%s: cannot set %d new addresses into table %s: %d\n", __func__, V_pfi_buffer_cnt, kt->pfrkt_name, e); } static void pfi_instance_add(struct ifnet *ifp, int net, int flags) { struct ifaddr *ia; int got4 = 0, got6 = 0; int net2, af; IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr == NULL) continue; af = ia->ifa_addr->sa_family; if (af != AF_INET && af != AF_INET6) continue; /* * XXX: For point-to-point interfaces, (ifname:0) and IPv4, * jump over addresses without a proper route to work * around a problem with ppp not fully removing the * address used during IPCP. */ if ((ifp->if_flags & IFF_POINTOPOINT) && !(ia->ifa_flags & IFA_ROUTE) && (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET)) continue; if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) continue; if ((flags & PFI_AFLAG_BROADCAST) && !(ifp->if_flags & IFF_BROADCAST)) continue; if ((flags & PFI_AFLAG_PEER) && !(ifp->if_flags & IFF_POINTOPOINT)) continue; if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL( &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) continue; if (flags & PFI_AFLAG_NOALIAS) { if (af == AF_INET && got4) continue; if (af == AF_INET6 && got6) continue; } if (af == AF_INET) got4 = 1; else if (af == AF_INET6) got6 = 1; net2 = net; if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) { if (af == AF_INET) net2 = pfi_unmask(&((struct sockaddr_in *) ia->ifa_netmask)->sin_addr); else if (af == AF_INET6) net2 = pfi_unmask(&((struct sockaddr_in6 *) ia->ifa_netmask)->sin6_addr); } if (af == AF_INET && net2 > 32) net2 = 32; if (flags & PFI_AFLAG_BROADCAST) pfi_address_add(ia->ifa_broadaddr, af, net2); else if (flags & PFI_AFLAG_PEER) pfi_address_add(ia->ifa_dstaddr, af, net2); else pfi_address_add(ia->ifa_addr, af, net2); } IF_ADDR_RUNLOCK(ifp); } static void pfi_address_add(struct sockaddr *sa, int af, int net) { struct pfr_addr *p; int i; if (V_pfi_buffer_cnt >= V_pfi_buffer_max) { int new_max = V_pfi_buffer_max * 2; if (new_max > PFI_BUFFER_MAX) { printf("%s: address buffer full (%d/%d)\n", __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX); return; } p = malloc(new_max * sizeof(*V_pfi_buffer), PFI_MTYPE, M_NOWAIT); if (p == NULL) { printf("%s: no memory to grow buffer (%d/%d)\n", __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX); return; } memcpy(p, V_pfi_buffer, V_pfi_buffer_max * sizeof(*V_pfi_buffer)); /* no need to zero buffer */ free(V_pfi_buffer, PFI_MTYPE); V_pfi_buffer = p; V_pfi_buffer_max = new_max; } if (af == AF_INET && net > 32) net = 128; p = V_pfi_buffer + V_pfi_buffer_cnt++; bzero(p, sizeof(*p)); p->pfra_af = af; p->pfra_net = net; if (af == AF_INET) p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr; else if (af == AF_INET6) { p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr; if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr)) p->pfra_ip6addr.s6_addr16[1] = 0; } /* mask network address bits */ if (net < 128) ((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8)); for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++) ((caddr_t)p)[i] = 0; } void pfi_dynaddr_remove(struct pfi_dynaddr *dyn) { KASSERT(dyn->pfid_kif != NULL, ("%s: null pfid_kif", __func__)); KASSERT(dyn->pfid_kt != NULL, ("%s: null pfid_kt", __func__)); TAILQ_REMOVE(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); pfi_kif_unref(dyn->pfid_kif); pfr_detach_table(dyn->pfid_kt); free(dyn, PFI_MTYPE); } void pfi_dynaddr_copyout(struct pf_addr_wrap *aw) { KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u", __func__, aw->type)); if (aw->p.dyn == NULL || aw->p.dyn->pfid_kif == NULL) return; aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6; } static int pfi_if_compare(struct pfi_kif *p, struct pfi_kif *q) { return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ)); } void pfi_update_status(const char *name, struct pf_status *pfs) { struct pfi_kif *p; struct pfi_kif_cmp key; struct ifg_member p_member, *ifgm; - TAILQ_HEAD(, ifg_member) ifg_members; + CK_STAILQ_HEAD(, ifg_member) ifg_members; int i, j, k; strlcpy(key.pfik_name, name, sizeof(key.pfik_name)); p = RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&key); if (p == NULL) return; if (p->pfik_group != NULL) { bcopy(&p->pfik_group->ifg_members, &ifg_members, sizeof(ifg_members)); } else { /* build a temporary list for p only */ bzero(&p_member, sizeof(p_member)); p_member.ifgm_ifp = p->pfik_ifp; - TAILQ_INIT(&ifg_members); - TAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next); + CK_STAILQ_INIT(&ifg_members); + CK_STAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next); } if (pfs) { bzero(pfs->pcounters, sizeof(pfs->pcounters)); bzero(pfs->bcounters, sizeof(pfs->bcounters)); } - TAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) { + CK_STAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) { if (ifgm->ifgm_ifp == NULL || ifgm->ifgm_ifp->if_pf_kif == NULL) continue; p = (struct pfi_kif *)ifgm->ifgm_ifp->if_pf_kif; /* just clear statistics */ if (pfs == NULL) { bzero(p->pfik_packets, sizeof(p->pfik_packets)); bzero(p->pfik_bytes, sizeof(p->pfik_bytes)); p->pfik_tzero = time_second; continue; } for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) for (k = 0; k < 2; k++) { pfs->pcounters[i][j][k] += p->pfik_packets[i][j][k]; pfs->bcounters[i][j] += p->pfik_bytes[i][j][k]; } } } void pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size) { struct pfi_kif *p, *nextp; int n = 0; for (p = RB_MIN(pfi_ifhead, &V_pfi_ifs); p; p = nextp) { nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); if (pfi_skip_if(name, p)) continue; if (*size <= n++) break; if (!p->pfik_tzero) p->pfik_tzero = time_second; bcopy(p, buf++, sizeof(*buf)); nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); } *size = n; } static int pfi_skip_if(const char *filter, struct pfi_kif *p) { int n; if (filter == NULL || !*filter) return (0); if (!strcmp(p->pfik_name, filter)) return (0); /* exact match */ n = strlen(filter); if (n < 1 || n >= IFNAMSIZ) return (1); /* sanity check */ if (filter[n-1] >= '0' && filter[n-1] <= '9') return (1); /* only do exact match in that case */ if (strncmp(p->pfik_name, filter, n)) return (1); /* prefix doesn't match */ return (p->pfik_name[n] < '0' || p->pfik_name[n] > '9'); } int pfi_set_flags(const char *name, int flags) { struct pfi_kif *p; RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { if (pfi_skip_if(name, p)) continue; p->pfik_flags |= flags; } return (0); } int pfi_clear_flags(const char *name, int flags) { struct pfi_kif *p; RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { if (pfi_skip_if(name, p)) continue; p->pfik_flags &= ~flags; } return (0); } /* from pf_print_state.c */ static int pfi_unmask(void *addr) { struct pf_addr *m = addr; int i = 31, j = 0, b = 0; u_int32_t tmp; while (j < 4 && m->addr32[j] == 0xffffffff) { b += 32; j++; } if (j < 4) { tmp = ntohl(m->addr32[j]); for (i = 31; tmp & (1 << i); --i) b++; } return (b); } static void pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp) { if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } pfi_attach_ifnet(ifp); #ifdef ALTQ PF_RULES_WLOCK(); pf_altq_ifnet_event(ifp, 0); PF_RULES_WUNLOCK(); #endif } static void pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp) { struct pfi_kif *kif = (struct pfi_kif *)ifp->if_pf_kif; if (kif == NULL) return; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } PF_RULES_WLOCK(); V_pfi_update++; pfi_kif_update(kif); kif->pfik_ifp = NULL; ifp->if_pf_kif = NULL; #ifdef ALTQ pf_altq_ifnet_event(ifp, 1); #endif PF_RULES_WUNLOCK(); } static void pfi_attach_group_event(void *arg __unused, struct ifg_group *ifg) { if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } pfi_attach_ifgroup(ifg); } static void pfi_change_group_event(void *arg __unused, char *gname) { struct pfi_kif *kif; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); PF_RULES_WLOCK(); V_pfi_update++; kif = pfi_kif_attach(kif, gname); pfi_kif_update(kif); PF_RULES_WUNLOCK(); } static void pfi_detach_group_event(void *arg __unused, struct ifg_group *ifg) { struct pfi_kif *kif = (struct pfi_kif *)ifg->ifg_pf_kif; if (kif == NULL) return; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } PF_RULES_WLOCK(); V_pfi_update++; kif->pfik_group = NULL; ifg->ifg_pf_kif = NULL; PF_RULES_WUNLOCK(); } static void pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp) { if (ifp->if_pf_kif == NULL) return; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } PF_RULES_WLOCK(); if (ifp && ifp->if_pf_kif) { V_pfi_update++; pfi_kif_update(ifp->if_pf_kif); } PF_RULES_WUNLOCK(); } Index: head/sys/nfs/bootp_subr.c =================================================================== --- head/sys/nfs/bootp_subr.c (revision 334117) +++ head/sys/nfs/bootp_subr.c (revision 334118) @@ -1,1903 +1,1903 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 Gordon Ross, Adam Glass * Copyright (c) 1992 Regents of the University of California. * All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * based on: * nfs/krpc_subr.c * $NetBSD: krpc_subr.c,v 1.10 1995/08/08 20:43:43 gwr Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfs.h" #include "opt_rootdevname.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef BOOTP_DEBUG #include #endif #include #include #include #include #include #include #include #include #include #include #define BOOTP_MIN_LEN 300 /* Minimum size of bootp udp packet */ #ifndef BOOTP_SETTLE_DELAY #define BOOTP_SETTLE_DELAY 3 #endif /* * Wait 10 seconds for interface appearance * USB ethernet adapters might require some time to pop up */ #ifndef BOOTP_IFACE_WAIT_TIMEOUT #define BOOTP_IFACE_WAIT_TIMEOUT 10 #endif /* * What is the longest we will wait before re-sending a request? * Note this is also the frequency of "RPC timeout" messages. * The re-send loop count sup linearly to this maximum, so the * first complaint will happen after (1+2+3+4+5)=15 seconds. */ #define MAX_RESEND_DELAY 5 /* seconds */ /* Definitions from RFC951 */ struct bootp_packet { u_int8_t op; u_int8_t htype; u_int8_t hlen; u_int8_t hops; u_int32_t xid; u_int16_t secs; u_int16_t flags; struct in_addr ciaddr; struct in_addr yiaddr; struct in_addr siaddr; struct in_addr giaddr; unsigned char chaddr[16]; char sname[64]; char file[128]; unsigned char vend[1222]; }; struct bootpc_ifcontext { STAILQ_ENTRY(bootpc_ifcontext) next; struct bootp_packet call; struct bootp_packet reply; int replylen; int overload; union { struct ifreq _ifreq; struct in_aliasreq _in_alias_req; } _req; #define ireq _req._ifreq #define iareq _req._in_alias_req struct ifnet *ifp; struct sockaddr_dl *sdl; struct sockaddr_in myaddr; struct sockaddr_in netmask; struct sockaddr_in gw; int gotgw; int gotnetmask; int gotrootpath; int outstanding; int sentmsg; u_int32_t xid; enum { IF_BOOTP_UNRESOLVED, IF_BOOTP_RESOLVED, IF_BOOTP_FAILED, IF_DHCP_UNRESOLVED, IF_DHCP_OFFERED, IF_DHCP_RESOLVED, IF_DHCP_FAILED, } state; int dhcpquerytype; /* dhcp type sent */ struct in_addr dhcpserver; int gotdhcpserver; uint16_t mtu; }; #define TAG_MAXLEN 1024 struct bootpc_tagcontext { char buf[TAG_MAXLEN + 1]; int overload; int badopt; int badtag; int foundopt; int taglen; }; struct bootpc_globalcontext { STAILQ_HEAD(, bootpc_ifcontext) interfaces; u_int32_t xid; int any_root_overrides; int gotrootpath; int gotgw; int ifnum; int secs; int starttime; struct bootp_packet reply; int replylen; struct bootpc_ifcontext *setrootfs; struct bootpc_ifcontext *sethostname; struct bootpc_tagcontext tmptag; struct bootpc_tagcontext tag; }; #define IPPORT_BOOTPC 68 #define IPPORT_BOOTPS 67 #define BOOTP_REQUEST 1 #define BOOTP_REPLY 2 /* Common tags */ #define TAG_PAD 0 /* Pad option, implicit length 1 */ #define TAG_SUBNETMASK 1 /* RFC 950 subnet mask */ #define TAG_ROUTERS 3 /* Routers (in order of preference) */ #define TAG_HOSTNAME 12 /* Client host name */ #define TAG_ROOT 17 /* Root path */ #define TAG_INTF_MTU 26 /* Interface MTU Size (RFC2132) */ /* DHCP specific tags */ #define TAG_OVERLOAD 52 /* Option Overload */ #define TAG_MAXMSGSIZE 57 /* Maximum DHCP Message Size */ #define TAG_END 255 /* End Option (i.e. no more options) */ /* Overload values */ #define OVERLOAD_FILE 1 #define OVERLOAD_SNAME 2 /* Site specific tags: */ #define TAG_ROOTOPTS 130 #define TAG_COOKIE 134 /* ascii info for userland, via sysctl */ #define TAG_DHCP_MSGTYPE 53 #define TAG_DHCP_REQ_ADDR 50 #define TAG_DHCP_SERVERID 54 #define TAG_DHCP_LEASETIME 51 #define TAG_VENDOR_INDENTIFIER 60 #define DHCP_NOMSG 0 #define DHCP_DISCOVER 1 #define DHCP_OFFER 2 #define DHCP_REQUEST 3 #define DHCP_ACK 5 /* NFS read/write block size */ #ifndef BOOTP_BLOCKSIZE #define BOOTP_BLOCKSIZE 8192 #endif static char bootp_cookie[128]; static struct socket *bootp_so; SYSCTL_STRING(_kern, OID_AUTO, bootp_cookie, CTLFLAG_RD, bootp_cookie, 0, "Cookie (T134) supplied by bootp server"); /* mountd RPC */ static int md_mount(struct sockaddr_in *mdsin, char *path, u_char *fhp, int *fhsizep, struct nfs_args *args, struct thread *td); static int setfs(struct sockaddr_in *addr, char *path, char *p, const struct in_addr *siaddr); static int getdec(char **ptr); static int getip(char **ptr, struct in_addr *ip); static void mountopts(struct nfs_args *args, char *p); static int xdr_opaque_decode(struct mbuf **ptr, u_char *buf, int len); static int xdr_int_decode(struct mbuf **ptr, int *iptr); static void print_in_addr(struct in_addr addr); static void print_sin_addr(struct sockaddr_in *addr); static void clear_sinaddr(struct sockaddr_in *sin); static void allocifctx(struct bootpc_globalcontext *gctx); static void bootpc_compose_query(struct bootpc_ifcontext *ifctx, struct thread *td); static unsigned char *bootpc_tag(struct bootpc_tagcontext *tctx, struct bootp_packet *bp, int len, int tag); static void bootpc_tag_helper(struct bootpc_tagcontext *tctx, unsigned char *start, int len, int tag); #ifdef BOOTP_DEBUG void bootpboot_p_sa(struct sockaddr *sa, struct sockaddr *ma); void bootpboot_p_rtentry(struct rtentry *rt); void bootpboot_p_tree(struct radix_node *rn); void bootpboot_p_rtlist(void); void bootpboot_p_if(struct ifnet *ifp, struct ifaddr *ifa); void bootpboot_p_iflist(void); #endif static int bootpc_call(struct bootpc_globalcontext *gctx, struct thread *td); static void bootpc_fakeup_interface(struct bootpc_ifcontext *ifctx, struct thread *td); static void bootpc_adjust_interface(struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx, struct thread *td); static void bootpc_decode_reply(struct nfsv3_diskless *nd, struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx); static int bootpc_received(struct bootpc_globalcontext *gctx, struct bootpc_ifcontext *ifctx); static __inline int bootpc_ifctx_isresolved(struct bootpc_ifcontext *ifctx); static __inline int bootpc_ifctx_isunresolved(struct bootpc_ifcontext *ifctx); static __inline int bootpc_ifctx_isfailed(struct bootpc_ifcontext *ifctx); /* * In order to have multiple active interfaces with address 0.0.0.0 * and be able to send data to a selected interface, we first set * mask to /8 on all interfaces, and temporarily set it to /0 when * doing sosend(). */ #ifdef BOOTP_DEBUG void bootpboot_p_sa(struct sockaddr *sa, struct sockaddr *ma) { if (sa == NULL) { printf("(sockaddr *) "); return; } switch (sa->sa_family) { case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *) sa; printf("inet "); print_sin_addr(sin); if (ma != NULL) { sin = (struct sockaddr_in *) ma; printf(" mask "); print_sin_addr(sin); } } break; case AF_LINK: { struct sockaddr_dl *sli; int i; sli = (struct sockaddr_dl *) sa; printf("link %.*s ", sli->sdl_nlen, sli->sdl_data); for (i = 0; i < sli->sdl_alen; i++) { if (i > 0) printf(":"); printf("%x", ((unsigned char *) LLADDR(sli))[i]); } } break; default: printf("af%d", sa->sa_family); } } void bootpboot_p_rtentry(struct rtentry *rt) { bootpboot_p_sa(rt_key(rt), rt_mask(rt)); printf(" "); bootpboot_p_sa(rt->rt_gateway, NULL); printf(" "); printf("flags %x", (unsigned short) rt->rt_flags); printf(" %d", (int) rt->rt_expire); printf(" %s\n", rt->rt_ifp->if_xname); } void bootpboot_p_tree(struct radix_node *rn) { while (rn != NULL) { if (rn->rn_bit < 0) { if ((rn->rn_flags & RNF_ROOT) != 0) { } else { bootpboot_p_rtentry((struct rtentry *) rn); } rn = rn->rn_dupedkey; } else { bootpboot_p_tree(rn->rn_left); bootpboot_p_tree(rn->rn_right); return; } } } void bootpboot_p_rtlist(void) { struct rib_head *rnh; printf("Routing table:\n"); rnh = rt_tables_get_rnh(0, AF_INET); if (rnh == NULL) return; RIB_RLOCK(rnh); /* could sleep XXX */ bootpboot_p_tree(rnh->rnh_treetop); RIB_RUNLOCK(rnh); } void bootpboot_p_if(struct ifnet *ifp, struct ifaddr *ifa) { printf("%s flags %x, addr ", ifp->if_xname, ifp->if_flags); print_sin_addr((struct sockaddr_in *) ifa->ifa_addr); printf(", broadcast "); print_sin_addr((struct sockaddr_in *) ifa->ifa_dstaddr); printf(", netmask "); print_sin_addr((struct sockaddr_in *) ifa->ifa_netmask); printf("\n"); } void bootpboot_p_iflist(void) { struct ifnet *ifp; struct ifaddr *ifa; printf("Interface list:\n"); IFNET_RLOCK(); - for (ifp = TAILQ_FIRST(&V_ifnet); + for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp != NULL; - ifp = TAILQ_NEXT(ifp, if_link)) { + ifp = CK_STAILQ_NEXT(ifp, if_link)) { for (ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); ifa != NULL; - ifa = TAILQ_NEXT(ifa, ifa_link)) + ifa = CK_STAILQ_NEXT(ifa, ifa_link)) if (ifa->ifa_addr->sa_family == AF_INET) bootpboot_p_if(ifp, ifa); } IFNET_RUNLOCK(); } #endif /* defined(BOOTP_DEBUG) */ static void clear_sinaddr(struct sockaddr_in *sin) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_addr.s_addr = INADDR_ANY; /* XXX: htonl(INAADDR_ANY) ? */ sin->sin_port = 0; } static void allocifctx(struct bootpc_globalcontext *gctx) { struct bootpc_ifcontext *ifctx; ifctx = malloc(sizeof(*ifctx), M_TEMP, M_WAITOK | M_ZERO); ifctx->xid = gctx->xid; #ifdef BOOTP_NO_DHCP ifctx->state = IF_BOOTP_UNRESOLVED; #else ifctx->state = IF_DHCP_UNRESOLVED; #endif gctx->xid += 0x100; STAILQ_INSERT_TAIL(&gctx->interfaces, ifctx, next); } static __inline int bootpc_ifctx_isresolved(struct bootpc_ifcontext *ifctx) { if (ifctx->state == IF_BOOTP_RESOLVED || ifctx->state == IF_DHCP_RESOLVED) return 1; return 0; } static __inline int bootpc_ifctx_isunresolved(struct bootpc_ifcontext *ifctx) { if (ifctx->state == IF_BOOTP_UNRESOLVED || ifctx->state == IF_DHCP_UNRESOLVED) return 1; return 0; } static __inline int bootpc_ifctx_isfailed(struct bootpc_ifcontext *ifctx) { if (ifctx->state == IF_BOOTP_FAILED || ifctx->state == IF_DHCP_FAILED) return 1; return 0; } static int bootpc_received(struct bootpc_globalcontext *gctx, struct bootpc_ifcontext *ifctx) { unsigned char dhcpreplytype; char *p; /* * Need timeout for fallback to less * desirable alternative. */ /* This call used for the side effect (badopt flag) */ (void) bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_END); /* If packet is invalid, ignore it */ if (gctx->tmptag.badopt != 0) return 0; p = bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_DHCP_MSGTYPE); if (p != NULL) dhcpreplytype = *p; else dhcpreplytype = DHCP_NOMSG; switch (ifctx->dhcpquerytype) { case DHCP_DISCOVER: if (dhcpreplytype != DHCP_OFFER /* Normal DHCP offer */ #ifndef BOOTP_FORCE_DHCP && dhcpreplytype != DHCP_NOMSG /* Fallback to BOOTP */ #endif ) return 0; break; case DHCP_REQUEST: if (dhcpreplytype != DHCP_ACK) return 0; case DHCP_NOMSG: break; } /* Ignore packet unless it gives us a root tag we didn't have */ if ((ifctx->state == IF_BOOTP_RESOLVED || (ifctx->dhcpquerytype == DHCP_DISCOVER && (ifctx->state == IF_DHCP_OFFERED || ifctx->state == IF_DHCP_RESOLVED))) && (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROOT) != NULL || bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_ROOT) == NULL)) return 0; bcopy(&gctx->reply, &ifctx->reply, gctx->replylen); ifctx->replylen = gctx->replylen; /* XXX: Only reset if 'perfect' response */ if (ifctx->state == IF_BOOTP_UNRESOLVED) ifctx->state = IF_BOOTP_RESOLVED; else if (ifctx->state == IF_DHCP_UNRESOLVED && ifctx->dhcpquerytype == DHCP_DISCOVER) { if (dhcpreplytype == DHCP_OFFER) ifctx->state = IF_DHCP_OFFERED; else ifctx->state = IF_BOOTP_RESOLVED; /* Fallback */ } else if (ifctx->state == IF_DHCP_OFFERED && ifctx->dhcpquerytype == DHCP_REQUEST) ifctx->state = IF_DHCP_RESOLVED; if (ifctx->dhcpquerytype == DHCP_DISCOVER && ifctx->state != IF_BOOTP_RESOLVED) { p = bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_DHCP_SERVERID); if (p != NULL && gctx->tmptag.taglen == 4) { memcpy(&ifctx->dhcpserver, p, 4); ifctx->gotdhcpserver = 1; } else ifctx->gotdhcpserver = 0; return 1; } ifctx->gotrootpath = (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROOT) != NULL); ifctx->gotgw = (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROUTERS) != NULL); ifctx->gotnetmask = (bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_SUBNETMASK) != NULL); return 1; } static int bootpc_call(struct bootpc_globalcontext *gctx, struct thread *td) { struct sockaddr_in *sin, dst; struct uio auio; struct sockopt sopt; struct iovec aio; int error, on, rcvflg, timo, len; time_t atimo; time_t rtimo; struct timeval tv; struct bootpc_ifcontext *ifctx; int outstanding; int gotrootpath; int retry; const char *s; tv.tv_sec = 1; tv.tv_usec = 0; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_RCVTIMEO; sopt.sopt_val = &tv; sopt.sopt_valsize = sizeof tv; error = sosetopt(bootp_so, &sopt); if (error != 0) goto out; /* * Enable broadcast. */ on = 1; sopt.sopt_name = SO_BROADCAST; sopt.sopt_val = &on; sopt.sopt_valsize = sizeof on; error = sosetopt(bootp_so, &sopt); if (error != 0) goto out; /* * Disable routing. */ on = 1; sopt.sopt_name = SO_DONTROUTE; sopt.sopt_val = &on; sopt.sopt_valsize = sizeof on; error = sosetopt(bootp_so, &sopt); if (error != 0) goto out; /* * Bind the local endpoint to a bootp client port. */ sin = &dst; clear_sinaddr(sin); sin->sin_port = htons(IPPORT_BOOTPC); error = sobind(bootp_so, (struct sockaddr *)sin, td); if (error != 0) { printf("bind failed\n"); goto out; } /* * Setup socket address for the server. */ sin = &dst; clear_sinaddr(sin); sin->sin_addr.s_addr = INADDR_BROADCAST; sin->sin_port = htons(IPPORT_BOOTPS); /* * Send it, repeatedly, until a reply is received, * but delay each re-send by an increasing amount. * If the delay hits the maximum, start complaining. */ timo = 0; rtimo = 0; for (;;) { outstanding = 0; gotrootpath = 0; STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { if (bootpc_ifctx_isresolved(ifctx) != 0 && bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_ROOT) != NULL) gotrootpath = 1; } STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { struct in_aliasreq *ifra = &ifctx->iareq; sin = (struct sockaddr_in *)&ifra->ifra_mask; ifctx->outstanding = 0; if (bootpc_ifctx_isresolved(ifctx) != 0 && gotrootpath != 0) { continue; } if (bootpc_ifctx_isfailed(ifctx) != 0) continue; outstanding++; ifctx->outstanding = 1; /* Proceed to next step in DHCP negotiation */ if ((ifctx->state == IF_DHCP_OFFERED && ifctx->dhcpquerytype != DHCP_REQUEST) || (ifctx->state == IF_DHCP_UNRESOLVED && ifctx->dhcpquerytype != DHCP_DISCOVER) || (ifctx->state == IF_BOOTP_UNRESOLVED && ifctx->dhcpquerytype != DHCP_NOMSG)) { ifctx->sentmsg = 0; bootpc_compose_query(ifctx, td); } /* Send BOOTP request (or re-send). */ if (ifctx->sentmsg == 0) { switch(ifctx->dhcpquerytype) { case DHCP_DISCOVER: s = "DHCP Discover"; break; case DHCP_REQUEST: s = "DHCP Request"; break; case DHCP_NOMSG: default: s = "BOOTP Query"; break; } printf("Sending %s packet from " "interface %s (%*D)\n", s, ifctx->ireq.ifr_name, ifctx->sdl->sdl_alen, (unsigned char *) LLADDR(ifctx->sdl), ":"); ifctx->sentmsg = 1; } aio.iov_base = (caddr_t) &ifctx->call; aio.iov_len = sizeof(ifctx->call); auio.uio_iov = &aio; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_offset = 0; auio.uio_resid = sizeof(ifctx->call); auio.uio_td = td; /* Set netmask to 0.0.0.0 */ clear_sinaddr(sin); error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); error = sosend(bootp_so, (struct sockaddr *) &dst, &auio, NULL, NULL, 0, td); if (error != 0) printf("%s: sosend: %d state %08x\n", __func__, error, (int )bootp_so->so_state); /* Set netmask to 255.0.0.0 */ sin->sin_addr.s_addr = htonl(IN_CLASSA_NET); error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); } if (outstanding == 0 && (rtimo == 0 || time_second >= rtimo)) { error = 0; goto out; } /* Determine new timeout. */ if (timo < MAX_RESEND_DELAY) timo++; else { printf("DHCP/BOOTP timeout for server "); print_sin_addr(&dst); printf("\n"); } /* * Wait for up to timo seconds for a reply. * The socket receive timeout was set to 1 second. */ atimo = timo + time_second; while (time_second < atimo) { aio.iov_base = (caddr_t) &gctx->reply; aio.iov_len = sizeof(gctx->reply); auio.uio_iov = &aio; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_offset = 0; auio.uio_resid = sizeof(gctx->reply); auio.uio_td = td; rcvflg = 0; error = soreceive(bootp_so, NULL, &auio, NULL, NULL, &rcvflg); gctx->secs = time_second - gctx->starttime; STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { if (bootpc_ifctx_isresolved(ifctx) != 0 || bootpc_ifctx_isfailed(ifctx) != 0) continue; ifctx->call.secs = htons(gctx->secs); } if (error == EWOULDBLOCK) continue; if (error != 0) goto out; len = sizeof(gctx->reply) - auio.uio_resid; /* Do we have the required number of bytes ? */ if (len < BOOTP_MIN_LEN) continue; gctx->replylen = len; /* Is it a reply? */ if (gctx->reply.op != BOOTP_REPLY) continue; /* Is this an answer to our query */ STAILQ_FOREACH(ifctx, &gctx->interfaces, next) { if (gctx->reply.xid != ifctx->call.xid) continue; /* Same HW address size ? */ if (gctx->reply.hlen != ifctx->call.hlen) continue; /* Correct HW address ? */ if (bcmp(gctx->reply.chaddr, ifctx->call.chaddr, ifctx->call.hlen) != 0) continue; break; } if (ifctx != NULL) { s = bootpc_tag(&gctx->tmptag, &gctx->reply, gctx->replylen, TAG_DHCP_MSGTYPE); if (s != NULL) { switch (*s) { case DHCP_OFFER: s = "DHCP Offer"; break; case DHCP_ACK: s = "DHCP Ack"; break; default: s = "DHCP (unexpected)"; break; } } else s = "BOOTP Reply"; printf("Received %s packet" " on %s from ", s, ifctx->ireq.ifr_name); print_in_addr(gctx->reply.siaddr); if (gctx->reply.giaddr.s_addr != htonl(INADDR_ANY)) { printf(" via "); print_in_addr(gctx->reply.giaddr); } if (bootpc_received(gctx, ifctx) != 0) { printf(" (accepted)"); if (ifctx->outstanding) { ifctx->outstanding = 0; outstanding--; } /* Network settle delay */ if (outstanding == 0) atimo = time_second + BOOTP_SETTLE_DELAY; } else printf(" (ignored)"); if (ifctx->gotrootpath || gctx->any_root_overrides) { gotrootpath = 1; rtimo = time_second + BOOTP_SETTLE_DELAY; if (ifctx->gotrootpath) printf(" (got root path)"); } printf("\n"); } } /* while secs */ #ifdef BOOTP_TIMEOUT if (gctx->secs > BOOTP_TIMEOUT && BOOTP_TIMEOUT > 0) break; #endif /* Force a retry if halfway in DHCP negotiation */ retry = 0; STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (ifctx->state == IF_DHCP_OFFERED) { if (ifctx->dhcpquerytype == DHCP_DISCOVER) retry = 1; else ifctx->state = IF_DHCP_UNRESOLVED; } if (retry != 0) continue; if (gotrootpath != 0) { gctx->gotrootpath = gotrootpath; if (rtimo != 0 && time_second >= rtimo) break; } } /* forever send/receive */ /* * XXX: These are errors of varying seriousness being silently * ignored */ STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) == 0) { printf("%s timeout for interface %s\n", ifctx->dhcpquerytype != DHCP_NOMSG ? "DHCP" : "BOOTP", ifctx->ireq.ifr_name); } if (gctx->gotrootpath != 0) { #if 0 printf("Got a root path, ignoring remaining timeout\n"); #endif error = 0; goto out; } #ifndef BOOTP_NFSROOT STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) != 0) { error = 0; goto out; } #endif error = ETIMEDOUT; out: return (error); } static void bootpc_fakeup_interface(struct bootpc_ifcontext *ifctx, struct thread *td) { struct ifreq *ifr; struct in_aliasreq *ifra; struct sockaddr_in *sin; int error; ifr = &ifctx->ireq; ifra = &ifctx->iareq; /* * Bring up the interface. * * Get the old interface flags and or IFF_UP into them; if * IFF_UP set blindly, interface selection can be clobbered. */ error = ifioctl(bootp_so, SIOCGIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCGIFFLAGS, error=%d", __func__, error); ifr->ifr_flags |= IFF_UP; error = ifioctl(bootp_so, SIOCSIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCSIFFLAGS, error=%d", __func__, error); /* * Do enough of ifconfig(8) so that the chosen interface * can talk to the servers. Set address to 0.0.0.0/8 and * broadcast address to local broadcast. */ sin = (struct sockaddr_in *)&ifra->ifra_addr; clear_sinaddr(sin); sin = (struct sockaddr_in *)&ifra->ifra_mask; clear_sinaddr(sin); sin->sin_addr.s_addr = htonl(IN_CLASSA_NET); sin = (struct sockaddr_in *)&ifra->ifra_broadaddr; clear_sinaddr(sin); sin->sin_addr.s_addr = htonl(INADDR_BROADCAST); error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); } static void bootpc_shutdown_interface(struct bootpc_ifcontext *ifctx, struct thread *td) { struct ifreq *ifr; struct sockaddr_in *sin; int error; ifr = &ifctx->ireq; printf("Shutdown interface %s\n", ifctx->ireq.ifr_name); error = ifioctl(bootp_so, SIOCGIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCGIFFLAGS, error=%d", __func__, error); ifr->ifr_flags &= ~IFF_UP; error = ifioctl(bootp_so, SIOCSIFFLAGS, (caddr_t)ifr, td); if (error != 0) panic("%s: SIOCSIFFLAGS, error=%d", __func__, error); sin = (struct sockaddr_in *) &ifr->ifr_addr; clear_sinaddr(sin); error = ifioctl(bootp_so, SIOCDIFADDR, (caddr_t) ifr, td); if (error != 0) panic("%s: SIOCDIFADDR, error=%d", __func__, error); } static void bootpc_adjust_interface(struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx, struct thread *td) { int error; struct sockaddr_in *sin; struct ifreq *ifr; struct in_aliasreq *ifra; struct sockaddr_in *myaddr; struct sockaddr_in *netmask; ifr = &ifctx->ireq; ifra = &ifctx->iareq; myaddr = &ifctx->myaddr; netmask = &ifctx->netmask; if (bootpc_ifctx_isresolved(ifctx) == 0) { /* Shutdown interfaces where BOOTP failed */ bootpc_shutdown_interface(ifctx, td); return; } printf("Adjusted interface %s", ifctx->ireq.ifr_name); /* Do BOOTP interface options */ if (ifctx->mtu != 0) { printf(" (MTU=%d%s)", ifctx->mtu, (ifctx->mtu > 1514) ? "/JUMBO" : ""); ifr->ifr_mtu = ifctx->mtu; error = ifioctl(bootp_so, SIOCSIFMTU, (caddr_t) ifr, td); if (error != 0) panic("%s: SIOCSIFMTU, error=%d", __func__, error); } printf("\n"); /* * Do enough of ifconfig(8) so that the chosen interface * can talk to the servers. (just set the address) */ sin = (struct sockaddr_in *) &ifr->ifr_addr; clear_sinaddr(sin); error = ifioctl(bootp_so, SIOCDIFADDR, (caddr_t) ifr, td); if (error != 0) panic("%s: SIOCDIFADDR, error=%d", __func__, error); bcopy(myaddr, &ifra->ifra_addr, sizeof(*myaddr)); bcopy(netmask, &ifra->ifra_mask, sizeof(*netmask)); clear_sinaddr(&ifra->ifra_broadaddr); ifra->ifra_broadaddr.sin_addr.s_addr = myaddr->sin_addr.s_addr | ~netmask->sin_addr.s_addr; error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td); if (error != 0) panic("%s: SIOCAIFADDR, error=%d", __func__, error); } static void bootpc_add_default_route(struct bootpc_ifcontext *ifctx) { int error; struct sockaddr_in defdst; struct sockaddr_in defmask; if (ifctx->gw.sin_addr.s_addr == htonl(INADDR_ANY)) return; clear_sinaddr(&defdst); clear_sinaddr(&defmask); error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&defdst, (struct sockaddr *) &ifctx->gw, (struct sockaddr *)&defmask, (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, RT_DEFAULT_FIB); if (error != 0) { printf("%s: RTM_ADD, error=%d\n", __func__, error); } } static void bootpc_remove_default_route(struct bootpc_ifcontext *ifctx) { int error; struct sockaddr_in defdst; struct sockaddr_in defmask; if (ifctx->gw.sin_addr.s_addr == htonl(INADDR_ANY)) return; clear_sinaddr(&defdst); clear_sinaddr(&defmask); error = rtrequest_fib(RTM_DELETE, (struct sockaddr *)&defdst, (struct sockaddr *) &ifctx->gw, (struct sockaddr *)&defmask, (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, RT_DEFAULT_FIB); if (error != 0) { printf("%s: RTM_DELETE, error=%d\n", __func__, error); } } static int setfs(struct sockaddr_in *addr, char *path, char *p, const struct in_addr *siaddr) { if (getip(&p, &addr->sin_addr) == 0) { if (siaddr != NULL && *p == '/') bcopy(siaddr, &addr->sin_addr, sizeof(struct in_addr)); else return 0; } else { if (*p != ':') return 0; p++; } addr->sin_len = sizeof(struct sockaddr_in); addr->sin_family = AF_INET; strlcpy(path, p, MNAMELEN); return 1; } static int getip(char **ptr, struct in_addr *addr) { char *p; unsigned int ip; int val; p = *ptr; ip = 0; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip = val << 24; if (*p != '.') return 0; p++; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip |= (val << 16); if (*p != '.') return 0; p++; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip |= (val << 8); if (*p != '.') return 0; p++; if (((val = getdec(&p)) < 0) || (val > 255)) return 0; ip |= val; addr->s_addr = htonl(ip); *ptr = p; return 1; } static int getdec(char **ptr) { char *p; int ret; p = *ptr; ret = 0; if ((*p < '0') || (*p > '9')) return -1; while ((*p >= '0') && (*p <= '9')) { ret = ret * 10 + (*p - '0'); p++; } *ptr = p; return ret; } static void mountopts(struct nfs_args *args, char *p) { args->version = NFS_ARGSVERSION; args->rsize = BOOTP_BLOCKSIZE; args->wsize = BOOTP_BLOCKSIZE; args->flags = NFSMNT_RSIZE | NFSMNT_WSIZE | NFSMNT_RESVPORT; args->sotype = SOCK_DGRAM; if (p != NULL) nfs_parse_options(p, args); } static int xdr_opaque_decode(struct mbuf **mptr, u_char *buf, int len) { struct mbuf *m; int alignedlen; m = *mptr; alignedlen = ( len + 3 ) & ~3; if (m->m_len < alignedlen) { m = m_pullup(m, alignedlen); if (m == NULL) { *mptr = NULL; return EBADRPC; } } bcopy(mtod(m, u_char *), buf, len); m_adj(m, alignedlen); *mptr = m; return 0; } static int xdr_int_decode(struct mbuf **mptr, int *iptr) { u_int32_t i; if (xdr_opaque_decode(mptr, (u_char *) &i, sizeof(u_int32_t)) != 0) return EBADRPC; *iptr = fxdr_unsigned(u_int32_t, i); return 0; } static void print_sin_addr(struct sockaddr_in *sin) { print_in_addr(sin->sin_addr); } static void print_in_addr(struct in_addr addr) { unsigned int ip; ip = ntohl(addr.s_addr); printf("%d.%d.%d.%d", ip >> 24, (ip >> 16) & 255, (ip >> 8) & 255, ip & 255); } static void bootpc_compose_query(struct bootpc_ifcontext *ifctx, struct thread *td) { unsigned char *vendp; unsigned char vendor_client[64]; uint32_t leasetime; uint8_t vendor_client_len; ifctx->gotrootpath = 0; bzero((caddr_t) &ifctx->call, sizeof(ifctx->call)); /* bootpc part */ ifctx->call.op = BOOTP_REQUEST; /* BOOTREQUEST */ ifctx->call.htype = 1; /* 10mb ethernet */ ifctx->call.hlen = ifctx->sdl->sdl_alen;/* Hardware address length */ ifctx->call.hops = 0; if (bootpc_ifctx_isunresolved(ifctx) != 0) ifctx->xid++; ifctx->call.xid = txdr_unsigned(ifctx->xid); bcopy(LLADDR(ifctx->sdl), &ifctx->call.chaddr, ifctx->sdl->sdl_alen); vendp = ifctx->call.vend; *vendp++ = 99; /* RFC1048 cookie */ *vendp++ = 130; *vendp++ = 83; *vendp++ = 99; *vendp++ = TAG_MAXMSGSIZE; *vendp++ = 2; *vendp++ = (sizeof(struct bootp_packet) >> 8) & 255; *vendp++ = sizeof(struct bootp_packet) & 255; snprintf(vendor_client, sizeof(vendor_client), "%s:%s:%s", ostype, MACHINE, osrelease); vendor_client_len = strlen(vendor_client); *vendp++ = TAG_VENDOR_INDENTIFIER; *vendp++ = vendor_client_len; memcpy(vendp, vendor_client, vendor_client_len); vendp += vendor_client_len; ifctx->dhcpquerytype = DHCP_NOMSG; switch (ifctx->state) { case IF_DHCP_UNRESOLVED: *vendp++ = TAG_DHCP_MSGTYPE; *vendp++ = 1; *vendp++ = DHCP_DISCOVER; ifctx->dhcpquerytype = DHCP_DISCOVER; ifctx->gotdhcpserver = 0; break; case IF_DHCP_OFFERED: *vendp++ = TAG_DHCP_MSGTYPE; *vendp++ = 1; *vendp++ = DHCP_REQUEST; ifctx->dhcpquerytype = DHCP_REQUEST; *vendp++ = TAG_DHCP_REQ_ADDR; *vendp++ = 4; memcpy(vendp, &ifctx->reply.yiaddr, 4); vendp += 4; if (ifctx->gotdhcpserver != 0) { *vendp++ = TAG_DHCP_SERVERID; *vendp++ = 4; memcpy(vendp, &ifctx->dhcpserver, 4); vendp += 4; } *vendp++ = TAG_DHCP_LEASETIME; *vendp++ = 4; leasetime = htonl(300); memcpy(vendp, &leasetime, 4); vendp += 4; break; default: break; } *vendp = TAG_END; ifctx->call.secs = 0; ifctx->call.flags = htons(0x8000); /* We need a broadcast answer */ } static int bootpc_hascookie(struct bootp_packet *bp) { return (bp->vend[0] == 99 && bp->vend[1] == 130 && bp->vend[2] == 83 && bp->vend[3] == 99); } static void bootpc_tag_helper(struct bootpc_tagcontext *tctx, unsigned char *start, int len, int tag) { unsigned char *j; unsigned char *ej; unsigned char code; if (tctx->badtag != 0 || tctx->badopt != 0) return; j = start; ej = j + len; while (j < ej) { code = *j++; if (code == TAG_PAD) continue; if (code == TAG_END) return; if (j >= ej || j + *j + 1 > ej) { tctx->badopt = 1; return; } len = *j++; if (code == tag) { if (tctx->taglen + len > TAG_MAXLEN) { tctx->badtag = 1; return; } tctx->foundopt = 1; if (len > 0) memcpy(tctx->buf + tctx->taglen, j, len); tctx->taglen += len; } if (code == TAG_OVERLOAD) tctx->overload = *j; j += len; } } static unsigned char * bootpc_tag(struct bootpc_tagcontext *tctx, struct bootp_packet *bp, int len, int tag) { tctx->overload = 0; tctx->badopt = 0; tctx->badtag = 0; tctx->foundopt = 0; tctx->taglen = 0; if (bootpc_hascookie(bp) == 0) return NULL; bootpc_tag_helper(tctx, &bp->vend[4], (unsigned char *) bp + len - &bp->vend[4], tag); if ((tctx->overload & OVERLOAD_FILE) != 0) bootpc_tag_helper(tctx, (unsigned char *) bp->file, sizeof(bp->file), tag); if ((tctx->overload & OVERLOAD_SNAME) != 0) bootpc_tag_helper(tctx, (unsigned char *) bp->sname, sizeof(bp->sname), tag); if (tctx->badopt != 0 || tctx->badtag != 0 || tctx->foundopt == 0) return NULL; tctx->buf[tctx->taglen] = '\0'; return tctx->buf; } static void bootpc_decode_reply(struct nfsv3_diskless *nd, struct bootpc_ifcontext *ifctx, struct bootpc_globalcontext *gctx) { char *p, *s; unsigned int ip; ifctx->gotgw = 0; ifctx->gotnetmask = 0; clear_sinaddr(&ifctx->myaddr); clear_sinaddr(&ifctx->netmask); clear_sinaddr(&ifctx->gw); ifctx->myaddr.sin_addr = ifctx->reply.yiaddr; ip = ntohl(ifctx->myaddr.sin_addr.s_addr); printf("%s at ", ifctx->ireq.ifr_name); print_sin_addr(&ifctx->myaddr); printf(" server "); print_in_addr(ifctx->reply.siaddr); ifctx->gw.sin_addr = ifctx->reply.giaddr; if (ifctx->reply.giaddr.s_addr != htonl(INADDR_ANY)) { printf(" via gateway "); print_in_addr(ifctx->reply.giaddr); } /* This call used for the side effect (overload flag) */ (void) bootpc_tag(&gctx->tmptag, &ifctx->reply, ifctx->replylen, TAG_END); if ((gctx->tmptag.overload & OVERLOAD_SNAME) == 0) if (ifctx->reply.sname[0] != '\0') printf(" server name %s", ifctx->reply.sname); if ((gctx->tmptag.overload & OVERLOAD_FILE) == 0) if (ifctx->reply.file[0] != '\0') printf(" boot file %s", ifctx->reply.file); printf("\n"); p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_SUBNETMASK); if (p != NULL) { if (gctx->tag.taglen != 4) panic("bootpc: subnet mask len is %d", gctx->tag.taglen); bcopy(p, &ifctx->netmask.sin_addr, 4); ifctx->gotnetmask = 1; printf("subnet mask "); print_sin_addr(&ifctx->netmask); printf(" "); } p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_ROUTERS); if (p != NULL) { /* Routers */ if (gctx->tag.taglen % 4) panic("bootpc: Router Len is %d", gctx->tag.taglen); if (gctx->tag.taglen > 0) { bcopy(p, &ifctx->gw.sin_addr, 4); printf("router "); print_sin_addr(&ifctx->gw); printf(" "); ifctx->gotgw = 1; gctx->gotgw = 1; } } /* * Choose a root filesystem. If a value is forced in the environment * and it contains "nfs:", use it unconditionally. Otherwise, if the * kernel is compiled with the ROOTDEVNAME option, then use it if: * - The server doesn't provide a pathname. * - The boothowto flags include RB_DFLTROOT (user said to override * the server value). */ p = NULL; if ((s = kern_getenv("vfs.root.mountfrom")) != NULL) { if ((p = strstr(s, "nfs:")) != NULL) p = strdup(p + 4, M_TEMP); freeenv(s); } if (p == NULL) { p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_ROOT); if (p != NULL) ifctx->gotrootpath = 1; } #ifdef ROOTDEVNAME if ((p == NULL || (boothowto & RB_DFLTROOT) != 0) && (p = strstr(ROOTDEVNAME, "nfs:")) != NULL) { p += 4; } #endif if (p != NULL) { if (gctx->setrootfs != NULL) { printf("rootfs %s (ignored) ", p); } else if (setfs(&nd->root_saddr, nd->root_hostnam, p, &ifctx->reply.siaddr)) { if (*p == '/') { printf("root_server "); print_sin_addr(&nd->root_saddr); printf(" "); } printf("rootfs %s ", p); gctx->gotrootpath = 1; gctx->setrootfs = ifctx; p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_ROOTOPTS); if (p != NULL) { mountopts(&nd->root_args, p); printf("rootopts %s ", p); } } else panic("Failed to set rootfs to %s", p); } p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_HOSTNAME); if (p != NULL) { if (gctx->tag.taglen >= MAXHOSTNAMELEN) panic("bootpc: hostname >= %d bytes", MAXHOSTNAMELEN); if (gctx->sethostname != NULL) { printf("hostname %s (ignored) ", p); } else { strcpy(nd->my_hostnam, p); mtx_lock(&prison0.pr_mtx); strcpy(prison0.pr_hostname, p); mtx_unlock(&prison0.pr_mtx); printf("hostname %s ", p); gctx->sethostname = ifctx; } } p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_COOKIE); if (p != NULL) { /* store in a sysctl variable */ int i, l = sizeof(bootp_cookie) - 1; for (i = 0; i < l && p[i] != '\0'; i++) bootp_cookie[i] = p[i]; p[i] = '\0'; } p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen, TAG_INTF_MTU); if (p != NULL) { ifctx->mtu = be16dec(p); } printf("\n"); if (ifctx->gotnetmask == 0) { if (IN_CLASSA(ntohl(ifctx->myaddr.sin_addr.s_addr))) ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSA_NET); else if (IN_CLASSB(ntohl(ifctx->myaddr.sin_addr.s_addr))) ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSB_NET); else ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSC_NET); } } void bootpc_init(void) { struct bootpc_ifcontext *ifctx; /* Interface BOOTP contexts */ struct bootpc_globalcontext *gctx; /* Global BOOTP context */ struct ifnet *ifp; struct sockaddr_dl *sdl; struct ifaddr *ifa; int error; #ifndef BOOTP_WIRED_TO int ifcnt; #endif struct nfsv3_diskless *nd; struct thread *td; int timeout; int delay; timeout = BOOTP_IFACE_WAIT_TIMEOUT * hz; delay = hz / 10; nd = &nfsv3_diskless; td = curthread; /* * If already filled in, don't touch it here */ if (nfs_diskless_valid != 0) return; gctx = malloc(sizeof(*gctx), M_TEMP, M_WAITOK | M_ZERO); STAILQ_INIT(&gctx->interfaces); gctx->xid = ~0xFFFF; gctx->starttime = time_second; /* * If ROOTDEVNAME is defined or vfs.root.mountfrom is set then we have * root-path overrides that can potentially let us boot even if we don't * get a root path from the server, so we can treat that as a non-error. */ #ifdef ROOTDEVNAME gctx->any_root_overrides = 1; #else gctx->any_root_overrides = testenv("vfs.root.mountfrom"); #endif /* * Find a network interface. */ CURVNET_SET(TD_TO_VNET(td)); #ifdef BOOTP_WIRED_TO printf("%s: wired to interface '%s'\n", __func__, __XSTRING(BOOTP_WIRED_TO)); allocifctx(gctx); #else /* * Preallocate interface context storage, if another interface * attaches and wins the race, it won't be eligible for bootp. */ ifcnt = 0; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT | IFF_BROADCAST)) != IFF_BROADCAST) continue; switch (ifp->if_alloctype) { case IFT_ETHER: break; default: continue; } ifcnt++; } IFNET_RUNLOCK(); if (ifcnt == 0) panic("%s: no eligible interfaces", __func__); for (; ifcnt > 0; ifcnt--) allocifctx(gctx); #endif retry: ifctx = STAILQ_FIRST(&gctx->interfaces); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifctx == NULL) break; #ifdef BOOTP_WIRED_TO if (strcmp(ifp->if_xname, __XSTRING(BOOTP_WIRED_TO)) != 0) continue; #else if ((ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT | IFF_BROADCAST)) != IFF_BROADCAST) continue; switch (ifp->if_alloctype) { case IFT_ETHER: break; default: continue; } #endif strlcpy(ifctx->ireq.ifr_name, ifp->if_xname, sizeof(ifctx->ireq.ifr_name)); ifctx->ifp = ifp; /* Get HW address */ sdl = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl->sdl_type == IFT_ETHER) break; } if (sdl == NULL) panic("bootpc: Unable to find HW address for %s", ifctx->ireq.ifr_name); ifctx->sdl = sdl; ifctx = STAILQ_NEXT(ifctx, next); } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (STAILQ_EMPTY(&gctx->interfaces) || STAILQ_FIRST(&gctx->interfaces)->ifp == NULL) { if (timeout > 0) { pause("bootpc", delay); timeout -= delay; goto retry; } #ifdef BOOTP_WIRED_TO panic("%s: Could not find interface specified " "by BOOTP_WIRED_TO: " __XSTRING(BOOTP_WIRED_TO), __func__); #else panic("%s: no suitable interface", __func__); #endif } error = socreate(AF_INET, &bootp_so, SOCK_DGRAM, 0, td->td_ucred, td); if (error != 0) panic("%s: socreate, error=%d", __func__, error); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) bootpc_fakeup_interface(ifctx, td); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) bootpc_compose_query(ifctx, td); error = bootpc_call(gctx, td); if (error != 0) { printf("BOOTP call failed\n"); } mountopts(&nd->root_args, NULL); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) != 0) bootpc_decode_reply(nd, ifctx, gctx); #ifdef BOOTP_NFSROOT if (gctx->gotrootpath == 0 && gctx->any_root_overrides == 0) panic("bootpc: No root path offered"); #endif STAILQ_FOREACH(ifctx, &gctx->interfaces, next) bootpc_adjust_interface(ifctx, gctx, td); soclose(bootp_so); STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (ifctx->gotrootpath != 0) break; if (ifctx == NULL) { STAILQ_FOREACH(ifctx, &gctx->interfaces, next) if (bootpc_ifctx_isresolved(ifctx) != 0) break; } if (ifctx == NULL) goto out; if (gctx->gotrootpath != 0) { kern_setenv("boot.netif.name", ifctx->ifp->if_xname); bootpc_add_default_route(ifctx); error = md_mount(&nd->root_saddr, nd->root_hostnam, nd->root_fh, &nd->root_fhsize, &nd->root_args, td); bootpc_remove_default_route(ifctx); if (error != 0) { if (gctx->any_root_overrides == 0) panic("nfs_boot: mount root, error=%d", error); else goto out; } rootdevnames[0] = "nfs:"; nfs_diskless_valid = 3; } strcpy(nd->myif.ifra_name, ifctx->ireq.ifr_name); bcopy(&ifctx->myaddr, &nd->myif.ifra_addr, sizeof(ifctx->myaddr)); bcopy(&ifctx->myaddr, &nd->myif.ifra_broadaddr, sizeof(ifctx->myaddr)); ((struct sockaddr_in *) &nd->myif.ifra_broadaddr)->sin_addr.s_addr = ifctx->myaddr.sin_addr.s_addr | ~ ifctx->netmask.sin_addr.s_addr; bcopy(&ifctx->netmask, &nd->myif.ifra_mask, sizeof(ifctx->netmask)); bcopy(&ifctx->gw, &nd->mygateway, sizeof(ifctx->gw)); out: while((ifctx = STAILQ_FIRST(&gctx->interfaces)) != NULL) { STAILQ_REMOVE_HEAD(&gctx->interfaces, next); free(ifctx, M_TEMP); } free(gctx, M_TEMP); } /* * RPC: mountd/mount * Given a server pathname, get an NFS file handle. * Also, sets sin->sin_port to the NFS service port. */ static int md_mount(struct sockaddr_in *mdsin, char *path, u_char *fhp, int *fhsizep, struct nfs_args *args, struct thread *td) { struct mbuf *m; int error; int authunixok; int authcount; int authver; #define RPCPROG_MNT 100005 #define RPCMNT_VER1 1 #define RPCMNT_VER3 3 #define RPCMNT_MOUNT 1 #define AUTH_SYS 1 /* unix style (uid, gids) */ #define AUTH_UNIX AUTH_SYS /* XXX honor v2/v3 flags in args->flags? */ #ifdef BOOTP_NFSV3 /* First try NFS v3 */ /* Get port number for MOUNTD. */ error = krpc_portmap(mdsin, RPCPROG_MNT, RPCMNT_VER3, &mdsin->sin_port, td); if (error == 0) { m = xdr_string_encode(path, strlen(path)); /* Do RPC to mountd. */ error = krpc_call(mdsin, RPCPROG_MNT, RPCMNT_VER3, RPCMNT_MOUNT, &m, NULL, td); } if (error == 0) { args->flags |= NFSMNT_NFSV3; } else { #endif /* Fallback to NFS v2 */ /* Get port number for MOUNTD. */ error = krpc_portmap(mdsin, RPCPROG_MNT, RPCMNT_VER1, &mdsin->sin_port, td); if (error != 0) return error; m = xdr_string_encode(path, strlen(path)); /* Do RPC to mountd. */ error = krpc_call(mdsin, RPCPROG_MNT, RPCMNT_VER1, RPCMNT_MOUNT, &m, NULL, td); if (error != 0) return error; /* message already freed */ #ifdef BOOTP_NFSV3 } #endif if (xdr_int_decode(&m, &error) != 0 || error != 0) goto bad; if ((args->flags & NFSMNT_NFSV3) != 0) { if (xdr_int_decode(&m, fhsizep) != 0 || *fhsizep > NFSX_V3FHMAX || *fhsizep <= 0) goto bad; } else *fhsizep = NFSX_V2FH; if (xdr_opaque_decode(&m, fhp, *fhsizep) != 0) goto bad; if (args->flags & NFSMNT_NFSV3) { if (xdr_int_decode(&m, &authcount) != 0) goto bad; authunixok = 0; if (authcount < 0 || authcount > 100) goto bad; while (authcount > 0) { if (xdr_int_decode(&m, &authver) != 0) goto bad; if (authver == AUTH_UNIX) authunixok = 1; authcount--; } if (authunixok == 0) goto bad; } /* Set port number for NFS use. */ error = krpc_portmap(mdsin, NFS_PROG, (args->flags & NFSMNT_NFSV3) ? NFS_VER3 : NFS_VER2, &mdsin->sin_port, td); goto out; bad: error = EBADRPC; out: m_freem(m); return error; } SYSINIT(bootp_rootconf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, bootpc_init, NULL); Index: head/sys/nfs/nfs_diskless.c =================================================================== --- head/sys/nfs/nfs_diskless.c (revision 334117) +++ head/sys/nfs/nfs_diskless.c (revision 334118) @@ -1,439 +1,439 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)autoconf.c 7.1 (Berkeley) 5/9/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NFS_IFACE_TIMEOUT_SECS 10 /* Timeout for interface to appear. */ static int inaddr_to_sockaddr(char *ev, struct sockaddr_in *sa); static int hwaddr_to_sockaddr(char *ev, struct sockaddr_dl *sa); static int decode_nfshandle(char *ev, u_char *fh, int maxfh); /* * This structure must be filled in by a primary bootstrap or bootstrap * server for a diskless/dataless machine. It is initialized below just * to ensure that it is allocated to initialized data (.data not .bss). */ struct nfs_diskless nfs_diskless = { { { 0 } } }; struct nfsv3_diskless nfsv3_diskless = { { { 0 } } }; int nfs_diskless_valid = 0; /* * Validate/sanity check a rsize/wsize parameter. */ static int checkrwsize(unsigned long v, const char *name) { /* * 32K is used as an upper bound because most servers * limit block size to satisfy IPv4's limit of * 64K/reassembled packet. The lower bound is pretty * much arbitrary. */ if (!(4 <= v && v <= 32*1024)) { printf("nfs_parse_options: invalid %s %lu ignored\n", name, v); return 0; } else return 1; } /* * Parse mount options and apply them to the supplied * nfs_diskless state. Used also by bootp/dhcp support. */ void nfs_parse_options(const char *envopts, struct nfs_args *nd) { char *opts, *o, *otmp; unsigned long v; opts = strdup(envopts, M_TEMP); otmp = opts; while ((o = strsep(&otmp, ":;, ")) != NULL) { if (*o == '\0') ; /* Skip empty options. */ else if (strcmp(o, "soft") == 0) nd->flags |= NFSMNT_SOFT; else if (strcmp(o, "intr") == 0) nd->flags |= NFSMNT_INT; else if (strcmp(o, "conn") == 0) nd->flags |= NFSMNT_NOCONN; else if (strcmp(o, "nolockd") == 0) nd->flags |= NFSMNT_NOLOCKD; else if (strcmp(o, "nocto") == 0) nd->flags |= NFSMNT_NOCTO; else if (strcmp(o, "nfsv2") == 0) nd->flags &= ~(NFSMNT_NFSV3 | NFSMNT_NFSV4); else if (strcmp(o, "nfsv3") == 0) { nd->flags &= ~NFSMNT_NFSV4; nd->flags |= NFSMNT_NFSV3; } else if (strcmp(o, "tcp") == 0) nd->sotype = SOCK_STREAM; else if (strcmp(o, "udp") == 0) nd->sotype = SOCK_DGRAM; else if (strncmp(o, "rsize=", 6) == 0) { v = strtoul(o+6, NULL, 10); if (checkrwsize(v, "rsize")) { nd->rsize = (int) v; nd->flags |= NFSMNT_RSIZE; } } else if (strncmp(o, "wsize=", 6) == 0) { v = strtoul(o+6, NULL, 10); if (checkrwsize(v, "wsize")) { nd->wsize = (int) v; nd->flags |= NFSMNT_WSIZE; } } else printf("%s: skipping unknown option \"%s\"\n", __func__, o); } free(opts, M_TEMP); } /* * Populate the essential fields in the nfsv3_diskless structure. * * The loader is expected to export the following environment variables: * * boot.netif.name name of boot interface * boot.netif.ip IP address on boot interface * boot.netif.netmask netmask on boot interface * boot.netif.gateway default gateway (optional) * boot.netif.hwaddr hardware address of boot interface * boot.netif.mtu interface mtu from bootp/dhcp (optional) * boot.nfsroot.server IP address of root filesystem server * boot.nfsroot.path path of the root filesystem on server * boot.nfsroot.nfshandle NFS handle for root filesystem on server * boot.nfsroot.nfshandlelen and length of this handle (for NFSv3 only) * boot.nfsroot.options NFS options for the root filesystem */ void nfs_setup_diskless(void) { struct nfs_diskless *nd = &nfs_diskless; struct nfsv3_diskless *nd3 = &nfsv3_diskless; struct ifnet *ifp; struct ifaddr *ifa; struct sockaddr_dl *sdl, ourdl; struct sockaddr_in myaddr, netmask; char *cp; int cnt, fhlen, is_nfsv3; uint32_t len; time_t timeout_at; if (nfs_diskless_valid != 0) return; /* get handle size. If this succeeds, it's an NFSv3 setup. */ if ((cp = kern_getenv("boot.nfsroot.nfshandlelen")) != NULL) { cnt = sscanf(cp, "%d", &len); freeenv(cp); if (cnt != 1 || len == 0 || len > NFSX_V3FHMAX) { printf("nfs_diskless: bad NFS handle len\n"); return; } nd3->root_fhsize = len; is_nfsv3 = 1; } else is_nfsv3 = 0; /* set up interface */ if (inaddr_to_sockaddr("boot.netif.ip", &myaddr)) return; if (inaddr_to_sockaddr("boot.netif.netmask", &netmask)) { printf("nfs_diskless: no netmask\n"); return; } if (is_nfsv3 != 0) { bcopy(&myaddr, &nd3->myif.ifra_addr, sizeof(myaddr)); bcopy(&myaddr, &nd3->myif.ifra_broadaddr, sizeof(myaddr)); ((struct sockaddr_in *) &nd3->myif.ifra_broadaddr)->sin_addr.s_addr = myaddr.sin_addr.s_addr | ~ netmask.sin_addr.s_addr; bcopy(&netmask, &nd3->myif.ifra_mask, sizeof(netmask)); } else { bcopy(&myaddr, &nd->myif.ifra_addr, sizeof(myaddr)); bcopy(&myaddr, &nd->myif.ifra_broadaddr, sizeof(myaddr)); ((struct sockaddr_in *) &nd->myif.ifra_broadaddr)->sin_addr.s_addr = myaddr.sin_addr.s_addr | ~ netmask.sin_addr.s_addr; bcopy(&netmask, &nd->myif.ifra_mask, sizeof(netmask)); } if (hwaddr_to_sockaddr("boot.netif.hwaddr", &ourdl)) { printf("nfs_diskless: no hardware address\n"); return; } ifa = NULL; timeout_at = time_uptime + NFS_IFACE_TIMEOUT_SECS; retry: CURVNET_SET(TD_TO_VNET(curthread)); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; if ((sdl->sdl_type == ourdl.sdl_type) && (sdl->sdl_alen == ourdl.sdl_alen) && !bcmp(LLADDR(sdl), LLADDR(&ourdl), sdl->sdl_alen)) { IFNET_RUNLOCK(); CURVNET_RESTORE(); goto match_done; } } } } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (time_uptime < timeout_at) { pause("nfssdl", hz / 5); goto retry; } printf("nfs_diskless: no interface\n"); return; /* no matching interface */ match_done: kern_setenv("boot.netif.name", ifp->if_xname); if (is_nfsv3 != 0) { strlcpy(nd3->myif.ifra_name, ifp->if_xname, sizeof(nd3->myif.ifra_name)); /* set up gateway */ inaddr_to_sockaddr("boot.netif.gateway", &nd3->mygateway); /* set up root mount */ nd3->root_args.rsize = 32768; /* XXX tunable? */ nd3->root_args.wsize = 32768; nd3->root_args.sotype = SOCK_STREAM; nd3->root_args.flags = (NFSMNT_NFSV3 | NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_RESVPORT); if (inaddr_to_sockaddr("boot.nfsroot.server", &nd3->root_saddr)) { printf("nfs_diskless: no server\n"); return; } nd3->root_saddr.sin_port = htons(NFS_PORT); fhlen = decode_nfshandle("boot.nfsroot.nfshandle", &nd3->root_fh[0], NFSX_V3FHMAX); if (fhlen == 0) { printf("nfs_diskless: no NFS handle\n"); return; } if (fhlen != nd3->root_fhsize) { printf("nfs_diskless: bad NFS handle len=%d\n", fhlen); return; } if ((cp = kern_getenv("boot.nfsroot.path")) != NULL) { strncpy(nd3->root_hostnam, cp, MNAMELEN - 1); freeenv(cp); } if ((cp = kern_getenv("boot.nfsroot.options")) != NULL) { nfs_parse_options(cp, &nd3->root_args); freeenv(cp); } nfs_diskless_valid = 3; } else { strlcpy(nd->myif.ifra_name, ifp->if_xname, sizeof(nd->myif.ifra_name)); /* set up gateway */ inaddr_to_sockaddr("boot.netif.gateway", &nd->mygateway); /* set up root mount */ nd->root_args.rsize = 8192; /* XXX tunable? */ nd->root_args.wsize = 8192; nd->root_args.sotype = SOCK_STREAM; nd->root_args.flags = (NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_RESVPORT); if (inaddr_to_sockaddr("boot.nfsroot.server", &nd->root_saddr)) { printf("nfs_diskless: no server\n"); return; } nd->root_saddr.sin_port = htons(NFS_PORT); if (decode_nfshandle("boot.nfsroot.nfshandle", &nd->root_fh[0], NFSX_V2FH) == 0) { printf("nfs_diskless: no NFS handle\n"); return; } if ((cp = kern_getenv("boot.nfsroot.path")) != NULL) { strncpy(nd->root_hostnam, cp, MNAMELEN - 1); freeenv(cp); } if ((cp = kern_getenv("boot.nfsroot.options")) != NULL) { struct nfs_args args; /* * XXX yech, convert between old and current * arg format */ args.flags = nd->root_args.flags; args.sotype = nd->root_args.sotype; args.rsize = nd->root_args.rsize; args.wsize = nd->root_args.wsize; nfs_parse_options(cp, &args); nd->root_args.flags = args.flags; nd->root_args.sotype = args.sotype; nd->root_args.rsize = args.rsize; nd->root_args.wsize = args.wsize; freeenv(cp); } nfs_diskless_valid = 1; } } static int inaddr_to_sockaddr(char *ev, struct sockaddr_in *sa) { u_int32_t a[4]; char *cp; int count; bzero(sa, sizeof(*sa)); sa->sin_len = sizeof(*sa); sa->sin_family = AF_INET; if ((cp = kern_getenv(ev)) == NULL) return (1); count = sscanf(cp, "%d.%d.%d.%d", &a[0], &a[1], &a[2], &a[3]); freeenv(cp); if (count != 4) return (1); sa->sin_addr.s_addr = htonl((a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]); return (0); } static int hwaddr_to_sockaddr(char *ev, struct sockaddr_dl *sa) { char *cp; u_int32_t a[6]; int count; bzero(sa, sizeof(*sa)); sa->sdl_len = sizeof(*sa); sa->sdl_family = AF_LINK; sa->sdl_type = IFT_ETHER; sa->sdl_alen = ETHER_ADDR_LEN; if ((cp = kern_getenv(ev)) == NULL) return (1); count = sscanf(cp, "%x:%x:%x:%x:%x:%x", &a[0], &a[1], &a[2], &a[3], &a[4], &a[5]); freeenv(cp); if (count != 6) return (1); sa->sdl_data[0] = a[0]; sa->sdl_data[1] = a[1]; sa->sdl_data[2] = a[2]; sa->sdl_data[3] = a[3]; sa->sdl_data[4] = a[4]; sa->sdl_data[5] = a[5]; return (0); } static int decode_nfshandle(char *ev, u_char *fh, int maxfh) { u_char *cp, *ep; int len, val; ep = cp = kern_getenv(ev); if (cp == NULL) return (0); if ((strlen(cp) < 2) || (*cp != 'X')) { freeenv(ep); return (0); } len = 0; cp++; for (;;) { if (*cp == 'X') { freeenv(ep); return (len); } if ((sscanf(cp, "%2x", &val) != 1) || (val > 0xff)) { freeenv(ep); return (0); } *(fh++) = val; len++; cp += 2; if (len > maxfh) { freeenv(ep); return (0); } } } #if !defined(BOOTP_NFSROOT) static void nfs_rootconf(void) { nfs_setup_diskless(); if (nfs_diskless_valid) rootdevnames[0] = "nfs:"; } SYSINIT(cpu_rootconf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, nfs_rootconf, NULL); #endif Index: head/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c (revision 334117) +++ head/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c (revision 334118) @@ -1,465 +1,465 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2015-2017, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #include "core_priv.h" #include #include #include #include #include #include static struct workqueue_struct *roce_gid_mgmt_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct roce_netdev_event_work { struct work_struct work; struct net_device *ndev; }; struct roce_rescan_work { struct work_struct work; struct ib_device *ib_dev; }; static const struct { bool (*is_supported)(const struct ib_device *device, u8 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct net_device *ndev) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); struct ib_gid_attr gid_attr; memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; for (i = 0; i != IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr.gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, &gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, &gid_attr); break; } } } } static int roce_gid_match_netdev(struct ib_device *ib_dev, u8 port, struct net_device *idev, void *cookie) { struct net_device *ndev = (struct net_device *)cookie; if (idev == NULL) return (0); return (ndev == idev); } static int roce_gid_match_all(struct ib_device *ib_dev, u8 port, struct net_device *idev, void *cookie) { if (idev == NULL) return (0); return (1); } static int roce_gid_enum_netdev_default(struct ib_device *ib_dev, u8 port, struct net_device *idev) { unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, idev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); return (hweight_long(gid_type_mask)); } #define ETH_IPOIB_DRV_NAME "ib" static inline int is_eth_ipoib_intf(struct net_device *dev) { if (strcmp(dev->if_dname, ETH_IPOIB_DRV_NAME)) return 0; return 1; } static void roce_gid_update_addr_callback(struct ib_device *device, u8 port, struct net_device *ndev, void *cookie) { struct ipx_entry { STAILQ_ENTRY(ipx_entry) entry; union ipx_addr { struct sockaddr sa[0]; struct sockaddr_in v4; struct sockaddr_in6 v6; } ipx_addr; struct net_device *ndev; }; struct ipx_entry *entry; struct net_device *idev; #if defined(INET) || defined(INET6) struct ifaddr *ifa; #endif struct ib_gid_attr gid_attr; union ib_gid gid; int default_gids; u16 index_num; int i; STAILQ_HEAD(, ipx_entry) ipx_head; STAILQ_INIT(&ipx_head); /* make sure default GIDs are in */ default_gids = roce_gid_enum_netdev_default(device, port, ndev); CURVNET_SET(ndev->if_vnet); IFNET_RLOCK(); - TAILQ_FOREACH(idev, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(idev, &V_ifnet, if_link) { if (idev != ndev) { if (idev->if_type != IFT_L2VLAN) continue; if (ndev != rdma_vlan_dev_real_dev(idev)) continue; } /* clone address information for IPv4 and IPv6 */ IF_ADDR_RLOCK(idev); #if defined(INET) CK_STAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET) continue; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) { pr_warn("roce_gid_update_addr_callback: " "couldn't allocate entry for IPv4 update\n"); continue; } entry->ipx_addr.v4 = *((struct sockaddr_in *)ifa->ifa_addr); entry->ndev = idev; STAILQ_INSERT_TAIL(&ipx_head, entry, entry); } #endif #if defined(INET6) CK_STAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET6) continue; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) { pr_warn("roce_gid_update_addr_callback: " "couldn't allocate entry for IPv6 update\n"); continue; } entry->ipx_addr.v6 = *((struct sockaddr_in6 *)ifa->ifa_addr); entry->ndev = idev; /* trash IPv6 scope ID */ sa6_recoverscope(&entry->ipx_addr.v6); entry->ipx_addr.v6.sin6_scope_id = 0; STAILQ_INSERT_TAIL(&ipx_head, entry, entry); } #endif IF_ADDR_RUNLOCK(idev); } IFNET_RUNLOCK(); CURVNET_RESTORE(); /* add missing GIDs, if any */ STAILQ_FOREACH(entry, &ipx_head, entry) { unsigned long gid_type_mask = roce_gid_type_mask_support(device, port); if (rdma_ip2gid(&entry->ipx_addr.sa[0], &gid) != 0) continue; for (i = 0; i != IB_GID_TYPE_SIZE; i++) { if (!((1UL << i) & gid_type_mask)) continue; /* check if entry found */ if (ib_find_cached_gid_by_port(device, &gid, i, port, entry->ndev, &index_num) == 0) break; } if (i != IB_GID_TYPE_SIZE) continue; /* add new GID */ update_gid(GID_ADD, device, port, &gid, entry->ndev); } /* remove stale GIDs, if any */ for (i = default_gids; ib_get_cached_gid(device, port, i, &gid, &gid_attr) == 0; i++) { union ipx_addr ipx; /* check for valid network device pointer */ ndev = gid_attr.ndev; if (ndev == NULL) continue; dev_put(ndev); /* don't delete empty entries */ if (memcmp(&gid, &zgid, sizeof(zgid)) == 0) continue; /* zero default */ memset(&ipx, 0, sizeof(ipx)); rdma_gid2ip(&ipx.sa[0], &gid); STAILQ_FOREACH(entry, &ipx_head, entry) { if (entry->ndev == ndev && memcmp(&entry->ipx_addr, &ipx, sizeof(ipx)) == 0) break; } /* check if entry found */ if (entry != NULL) continue; /* remove GID */ update_gid(GID_DEL, device, port, &gid, ndev); } while ((entry = STAILQ_FIRST(&ipx_head))) { STAILQ_REMOVE_HEAD(&ipx_head, entry); kfree(entry); } } static void roce_gid_queue_scan_event_handler(struct work_struct *_work) { struct roce_netdev_event_work *work = container_of(_work, struct roce_netdev_event_work, work); ib_enum_all_roce_netdevs(roce_gid_match_netdev, work->ndev, roce_gid_update_addr_callback, NULL); dev_put(work->ndev); kfree(work); } static void roce_gid_queue_scan_event(struct net_device *ndev) { struct roce_netdev_event_work *work; retry: if (is_eth_ipoib_intf(ndev)) return; if (ndev->if_type != IFT_ETHER) { if (ndev->if_type == IFT_L2VLAN) { ndev = rdma_vlan_dev_real_dev(ndev); if (ndev != NULL) goto retry; } return; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) { pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); return; } INIT_WORK(&work->work, roce_gid_queue_scan_event_handler); dev_hold(ndev); work->ndev = ndev; queue_work(roce_gid_mgmt_wq, &work->work); } static void roce_gid_delete_all_event_handler(struct work_struct *_work) { struct roce_netdev_event_work *work = container_of(_work, struct roce_netdev_event_work, work); ib_cache_gid_del_all_by_netdev(work->ndev); dev_put(work->ndev); kfree(work); } static void roce_gid_delete_all_event(struct net_device *ndev) { struct roce_netdev_event_work *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) { pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); return; } INIT_WORK(&work->work, roce_gid_delete_all_event_handler); dev_hold(ndev); work->ndev = ndev; queue_work(roce_gid_mgmt_wq, &work->work); /* make sure job is complete before returning */ flush_workqueue(roce_gid_mgmt_wq); } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = ptr; switch (event) { case NETDEV_UNREGISTER: roce_gid_delete_all_event(ndev); break; case NETDEV_REGISTER: case NETDEV_CHANGEADDR: case NETDEV_CHANGEIFADDR: roce_gid_queue_scan_event(ndev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static void roce_rescan_device_handler(struct work_struct *_work) { struct roce_rescan_work *work = container_of(_work, struct roce_rescan_work, work); ib_enum_roce_netdev(work->ib_dev, roce_gid_match_all, NULL, roce_gid_update_addr_callback, NULL); kfree(work); } /* Caller must flush system workqueue before removing the ib_device */ int roce_rescan_device(struct ib_device *ib_dev) { struct roce_rescan_work *work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) return -ENOMEM; work->ib_dev = ib_dev; INIT_WORK(&work->work, roce_rescan_device_handler); queue_work(roce_gid_mgmt_wq, &work->work); return 0; } int __init roce_gid_mgmt_init(void) { roce_gid_mgmt_wq = alloc_ordered_workqueue("roce_gid_mgmt_wq", 0); if (!roce_gid_mgmt_wq) { pr_warn("roce_gid_mgmt: can't allocate work queue\n"); return -ENOMEM; } register_inetaddr_notifier(&nb_inetaddr); /* * We rely on the netdevice notifier to enumerate all existing * devices in the system. Register to this notifier last to * make sure we will not miss any IP add/del callbacks. */ register_netdevice_notifier(&nb_inetaddr); return 0; } void __exit roce_gid_mgmt_cleanup(void) { unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_inetaddr); /* * Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ synchronize_rcu(); drain_workqueue(roce_gid_mgmt_wq); destroy_workqueue(roce_gid_mgmt_wq); }