Index: projects/netbsd-tests-upstream-01-2017/lib/msun/tests/Makefile =================================================================== --- projects/netbsd-tests-upstream-01-2017/lib/msun/tests/Makefile (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/lib/msun/tests/Makefile (revision 312218) @@ -1,105 +1,102 @@ # $FreeBSD$ .include TESTSRC= ${SRCTOP}/contrib/netbsd-tests/lib/libm # All architectures on FreeBSD have fenv.h CFLAGS+= -DHAVE_FENV_H # For isqemu.h CFLAGS+= -I${TESTSRC:H}/libc/gen # Not sure why this isn't defined for all architectures, since most # have long double. .if ${MACHINE_CPUARCH} == "aarch64" || \ ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" CFLAGS+= -D__HAVE_LONG_DOUBLE .endif NETBSD_ATF_TESTS_C= acos_test NETBSD_ATF_TESTS_C+= asin_test NETBSD_ATF_TESTS_C+= atan_test NETBSD_ATF_TESTS_C+= cbrt_test NETBSD_ATF_TESTS_C+= ceil_test NETBSD_ATF_TESTS_C+= casinh_test NETBSD_ATF_TESTS_C+= cos_test NETBSD_ATF_TESTS_C+= cosh_test NETBSD_ATF_TESTS_C+= erf_test NETBSD_ATF_TESTS_C+= exp_test NETBSD_ATF_TESTS_C+= fmod_test NETBSD_ATF_TESTS_C+= fe_round_test NETBSD_ATF_TESTS_C+= infinity_test NETBSD_ATF_TESTS_C+= ilogb_test NETBSD_ATF_TESTS_C+= ldexp_test NETBSD_ATF_TESTS_C+= log_test NETBSD_ATF_TESTS_C+= pow_test NETBSD_ATF_TESTS_C+= precision_test NETBSD_ATF_TESTS_C+= round_test NETBSD_ATF_TESTS_C+= scalbn_test NETBSD_ATF_TESTS_C+= sin_test NETBSD_ATF_TESTS_C+= sinh_test NETBSD_ATF_TESTS_C+= sqrt_test NETBSD_ATF_TESTS_C+= tan_test NETBSD_ATF_TESTS_C+= tanh_test TAP_TESTS_C+= cexp_test TAP_TESTS_C+= conj_test .if ${MACHINE_CPUARCH} != "aarch64" # Hits an assert in llvm when building for arm64: # https://llvm.org/bugs/show_bug.cgi?id=26081 TAP_TESTS_C+= csqrt_test .endif TAP_TESTS_C+= ctrig_test TAP_TESTS_C+= exponential_test TAP_TESTS_C+= fenv_test TAP_TESTS_C+= fma_test -# clang 3.8.0 fails always fails this test. See: bug 208703 -.if ! (${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 30800) TAP_TESTS_C+= fmaxmin_test -.endif TAP_TESTS_C+= ilogb2_test TAP_TESTS_C+= invtrig_test TAP_TESTS_C+= invctrig_test TAP_TESTS_C+= logarithm_test TAP_TESTS_C+= lrint_test # XXX: the testcase crashes on all platforms, but only on head # (bug 205451) #TAP_TESTS_C+= lround_test TAP_TESTS_C+= nan_test TAP_TESTS_C+= nearbyint_test TAP_TESTS_C+= next_test TAP_TESTS_C+= rem_test TAP_TESTS_C+= trig_test .if !empty(PROG) && !empty(TAP_TESTS_C:M${PROG}) CFLAGS+= -O0 .endif CSTD= c99 #COPTS+= -Wfloat-equal IGNORE_PRAGMA= SRCS.ilogb2_test= ilogb_test.c LIBADD+= m # Copied from lib/msun/Makefile .if ${MACHINE_CPUARCH} == "i386" ARCH_SUBDIR= i387 .else ARCH_SUBDIR= ${MACHINE_CPUARCH} .endif .include "../${ARCH_SUBDIR}/Makefile.inc" # XXX: for some odd reason float.h doesn't tell the full story about what the # precision is. CFLAGS+= -DLDBL_PREC=${LDBL_PREC} .include .include Index: projects/netbsd-tests-upstream-01-2017/lib/msun/tests/fmaxmin_test.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/lib/msun/tests/fmaxmin_test.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/lib/msun/tests/fmaxmin_test.c (revision 312218) @@ -1,136 +1,153 @@ /*- * Copyright (c) 2008 David Schultz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Tests for fmax{,f,l}() and fmin{,f,l}. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "test-utils.h" #pragma STDC FENV_ACCESS ON /* * Test whether func(x, y) has the expected result, and make sure no * exceptions are raised. */ #define TEST(func, type, x, y, expected) do { \ type __x = (x); /* convert before we clear exceptions */ \ type __y = (y); \ feclearexcept(ALL_STD_EXCEPT); \ long double __result = func((__x), (__y)); \ if (fetestexcept(ALL_STD_EXCEPT)) { \ fprintf(stderr, #func "(%.20Lg, %.20Lg) raised 0x%x\n", \ (x), (y), fetestexcept(FE_ALL_EXCEPT)); \ ok = 0; \ } \ if (!fpequal(__result, (expected))) { \ fprintf(stderr, #func "(%.20Lg, %.20Lg) = %.20Lg, " \ "expected %.20Lg\n", (x), (y), __result, (expected)); \ ok = 0; \ } \ } while (0) int testall_r(long double big, long double small) { int ok; long double expected_max = isnan(big) ? small : big; long double expected_min = isnan(small) ? big : small; ok = 1; TEST(fmaxf, float, big, small, expected_max); TEST(fmaxf, float, small, big, expected_max); TEST(fmax, double, big, small, expected_max); TEST(fmax, double, small, big, expected_max); TEST(fmaxl, long double, big, small, expected_max); TEST(fmaxl, long double, small, big, expected_max); TEST(fminf, float, big, small, expected_min); TEST(fminf, float, small, big, expected_min); TEST(fmin, double, big, small, expected_min); TEST(fmin, double, small, big, expected_min); TEST(fminl, long double, big, small, expected_min); TEST(fminl, long double, small, big, expected_min); return (ok); } +const char *comment = NULL; + /* * Test all the functions: fmaxf, fmax, fmaxl, fminf, fmin, and fminl, * in all rounding modes and with the arguments in different orders. * The input 'big' must be >= 'small'. */ void testall(int testnum, long double big, long double small) { static const int rmodes[] = { FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO }; int i; for (i = 0; i < 4; i++) { fesetround(rmodes[i]); if (!testall_r(big, small)) { fprintf(stderr, "FAILURE in rounding mode %d\n", rmodes[i]); break; } } - printf("%sok %d - big = %.20Lg, small = %.20Lg\n", - (i == 4) ? "" : "not ", testnum, big, small); + printf("%sok %d - big = %.20Lg, small = %.20Lg%s\n", + (i == 4) ? "" : "not ", testnum, big, small, + comment == NULL ? "" : comment); } +/* Clang 3.8.0+ fails the invariants for testcase 6, 7, 10, and 11. */ +#if defined(__clang__) && \ + (__clang_major__ >= 3 && __clang_minor__ >= 8 && __clang_patchlevel__ >= 0) +#define affected_by_bug_208703 +#endif + int main(int argc, char *argv[]) { printf("1..12\n"); testall(1, 1.0, 0.0); testall(2, 42.0, nextafterf(42.0, -INFINITY)); testall(3, nextafterf(42.0, INFINITY), 42.0); testall(4, -5.0, -5.0); testall(5, -3.0, -4.0); +#ifdef affected_by_bug_208703 + comment = "# TODO: testcase 6-7 fails invariant with clang 3.8+ (bug 208703)"; +#endif testall(6, 1.0, NAN); testall(7, INFINITY, NAN); + comment = NULL; testall(8, INFINITY, 1.0); testall(9, -3.0, -INFINITY); testall(10, 3.0, -INFINITY); +#ifdef affected_by_bug_208703 + comment = "# TODO: testcase 11-12 fails invariant with clang 3.8+ (bug 208703)"; +#endif testall(11, NAN, NAN); /* This test isn't strictly required to work by C99. */ testall(12, 0.0, -0.0); + comment = NULL; return (0); } Index: projects/netbsd-tests-upstream-01-2017/release/tools/ec2.conf =================================================================== --- projects/netbsd-tests-upstream-01-2017/release/tools/ec2.conf (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/release/tools/ec2.conf (revision 312218) @@ -1,85 +1,90 @@ #!/bin/sh # # $FreeBSD$ # # Packages to install into the image we're creating. This is a deliberately # minimalist set, providing only the packages necessary to bootstrap further # package installation as specified via EC2 user-data. -export VM_EXTRA_PACKAGES="ec2-scripts firstboot-freebsd-update firstboot-pkgs" +export VM_EXTRA_PACKAGES="ec2-scripts firstboot-freebsd-update firstboot-pkgs dual-dhclient" # Set to a list of third-party software to enable in rc.conf(5). export VM_RC_LIST="ec2_configinit ec2_fetchkey ec2_ephemeralswap ec2_loghostkey firstboot_freebsd_update firstboot_pkgs" # Build with a 1.5 GB UFS partition; the growfs rc.d script will expand # the partition to fill the root disk after the EC2 instance is launched. # Note that if this is set to G, we will end up with an GB disk # image since VMSIZE is the size of the UFS partition, not the disk which # it resides within. export VMSIZE=1536M # No swap space; the ec2_ephemeralswap rc.d script will allocate swap # space on EC2 ephemeral disks. (If they exist -- the T2 low-cost instances # and the C4 compute-optimized instances don't have ephemeral disks. But # it would be silly to bloat the image and increase costs for every instance # just for those two families, especially since instances ranging in size # from 1 GB of RAM to 60 GB of RAM would need different sizes of swap space # anyway.) export NOSWAP=YES vm_extra_pre_umount() { # The firstboot_pkgs rc.d script will download the repository # catalogue and install or update pkg when the instance first # launches, so these files would just be replaced anyway; removing # them from the image allows it to boot faster. env ASSUME_ALWAYS_YES=yes pkg -c ${DESTDIR} delete -f -y pkg rm ${DESTDIR}/var/db/pkg/repo-*.sqlite # The size of the EC2 root disk can be configured at instance launch # time; expand our filesystem to fill the disk. echo 'growfs_enable="YES"' >> ${DESTDIR}/etc/rc.conf - # EC2 instances use DHCP to get their network configuration. - echo 'ifconfig_DEFAULT="SYNCDHCP"' >> ${DESTDIR}/etc/rc.conf + # EC2 instances use DHCP to get their network configuration. IPv6 + # requires accept_rtadv. + echo 'ifconfig_DEFAULT="SYNCDHCP accept_rtadv"' >> ${DESTDIR}/etc/rc.conf # Unless the system has been configured via EC2 user-data, the user # will need to SSH in to do anything. echo 'sshd_enable="YES"' >> ${DESTDIR}/etc/rc.conf # The AWS CLI tools are generally useful, and small enough that they # will download quickly; but users will often override this setting # via EC2 user-data. echo 'firstboot_pkgs_list="awscli"' >> ${DESTDIR}/etc/rc.conf + + # Enable IPv6 on all interfaces, and use DHCP on both IPv4 and IPv6. + echo 'ipv6_activate_all_interfaces="YES"' >> ${DESTDIR}/etc/rc.conf + echo 'dhclient_program="/usr/local/sbin/dual-dhclient"' >> ${DESTDIR}/etc/rc.conf # The EC2 console is output-only, so while printing a backtrace can # be useful, there's no point dropping into a debugger or waiting # for a keypress. echo 'debug.trace_on_panic=1' >> ${DESTDIR}/etc/sysctl.conf echo 'debug.debugger_on_panic=0' >> ${DESTDIR}/etc/sysctl.conf echo 'kern.panic_reboot_wait_time=0' >> ${DESTDIR}/etc/sysctl.conf # The console is not interactive, so we might as well boot quickly. echo 'autoboot_delay="-1"' >> ${DESTDIR}/boot/loader.conf echo 'beastie_disable="YES"' >> ${DESTDIR}/boot/loader.conf # EC2 has two consoles: An emulated serial port ("system log"), # which has been present since 2006; and a VGA console ("instance # screenshot") which was introduced in 2016. echo 'boot_multicons="YES"' >> ${DESTDIR}/boot/loader.conf # Some older EC2 hardware used a version of Xen with a bug in its # emulated serial port. It is not clear if EC2 still has any such # nodes, but apply the workaround just in case. echo 'hw.broken_txfifo="1"' >> ${DESTDIR}/boot/loader.conf # The first time the AMI boots, the installed "first boot" scripts # should be allowed to run: # * ec2_configinit (download and process EC2 user-data) # * ec2_fetchkey (arrange for SSH using the EC2-provided public key) # * growfs (expand the filesystem to fill the provided disk) # * firstboot_freebsd_update (install critical updates) # * firstboot_pkgs (install packages) touch ${DESTDIR}/firstboot return 0 } Index: projects/netbsd-tests-upstream-01-2017/sys/ddb/db_input.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/ddb/db_input.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/ddb/db_input.c (revision 312218) @@ -1,367 +1,374 @@ /*- * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Author: David B. Golub, Carnegie Mellon University * Date: 7/90 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include /* * Character input and editing. */ /* * We don't track output position while editing input, * since input always ends with a new-line. We just * reset the line position at the end. */ static char * db_lbuf_start; /* start of input line buffer */ static char * db_lbuf_end; /* end of input line buffer */ static char * db_lc; /* current character */ static char * db_le; /* one past last character */ /* * Simple input line history support. */ static char db_lhistory[2048]; static int db_lhistlsize, db_lhistidx, db_lhistcur; static int db_lhist_nlines; #define CTRL(c) ((c) & 0x1f) #define BLANK ' ' #define BACKUP '\b' +static int cnmaygetc(void); static void db_delete(int n, int bwd); static int db_inputchar(int c); static void db_putnchars(int c, int count); static void db_putstring(char *s, int count); static void db_putstring(s, count) char *s; int count; { while (--count >= 0) cnputc(*s++); } static void db_putnchars(c, count) int c; int count; { while (--count >= 0) cnputc(c); } /* * Delete N characters, forward or backward */ #define DEL_FWD 0 #define DEL_BWD 1 static void db_delete(n, bwd) int n; int bwd; { char *p; if (bwd) { db_lc -= n; db_putnchars(BACKUP, n); } for (p = db_lc; p < db_le-n; p++) { *p = *(p+n); cnputc(*p); } db_putnchars(BLANK, n); db_putnchars(BACKUP, db_le - db_lc); db_le -= n; } /* returns true at end-of-line */ static int db_inputchar(c) int c; { static int escstate; if (escstate == 1) { /* ESC seen, look for [ or O */ if (c == '[' || c == 'O') escstate++; else escstate = 0; /* re-init state machine */ return (0); } else if (escstate == 2) { escstate = 0; /* * If a valid cursor key has been found, translate * into an emacs-style control key, and fall through. * Otherwise, drop off. */ switch (c) { case 'A': /* up */ c = CTRL('p'); break; case 'B': /* down */ c = CTRL('n'); break; case 'C': /* right */ c = CTRL('f'); break; case 'D': /* left */ c = CTRL('b'); break; default: return (0); } } switch (c) { case CTRL('['): escstate = 1; break; case CTRL('b'): /* back up one character */ if (db_lc > db_lbuf_start) { cnputc(BACKUP); db_lc--; } break; case CTRL('f'): /* forward one character */ if (db_lc < db_le) { cnputc(*db_lc); db_lc++; } break; case CTRL('a'): /* beginning of line */ while (db_lc > db_lbuf_start) { cnputc(BACKUP); db_lc--; } break; case CTRL('e'): /* end of line */ while (db_lc < db_le) { cnputc(*db_lc); db_lc++; } break; case CTRL('h'): case 0177: /* erase previous character */ if (db_lc > db_lbuf_start) db_delete(1, DEL_BWD); break; case CTRL('d'): /* erase next character */ if (db_lc < db_le) db_delete(1, DEL_FWD); break; case CTRL('u'): /* kill entire line: */ /* at first, delete to beginning of line */ if (db_lc > db_lbuf_start) db_delete(db_lc - db_lbuf_start, DEL_BWD); /* FALLTHROUGH */ case CTRL('k'): /* delete to end of line */ if (db_lc < db_le) db_delete(db_le - db_lc, DEL_FWD); break; case CTRL('t'): /* twiddle last 2 characters */ if (db_lc >= db_lbuf_start + 2) { c = db_lc[-2]; db_lc[-2] = db_lc[-1]; db_lc[-1] = c; cnputc(BACKUP); cnputc(BACKUP); cnputc(db_lc[-2]); cnputc(db_lc[-1]); } break; case CTRL('r'): db_putstring("^R\n", 3); redraw: if (db_le > db_lbuf_start) { db_putstring(db_lbuf_start, db_le - db_lbuf_start); db_putnchars(BACKUP, db_le - db_lc); } break; case CTRL('p'): /* Make previous history line the active one. */ if (db_lhistcur >= 0) { bcopy(db_lhistory + db_lhistcur * db_lhistlsize, db_lbuf_start, db_lhistlsize); db_lhistcur--; goto hist_redraw; } break; case CTRL('n'): /* Make next history line the active one. */ if (db_lhistcur < db_lhistidx - 1) { db_lhistcur += 2; bcopy(db_lhistory + db_lhistcur * db_lhistlsize, db_lbuf_start, db_lhistlsize); } else { /* * ^N through tail of history, reset the * buffer to zero length. */ *db_lbuf_start = '\0'; db_lhistcur = db_lhistidx; } hist_redraw: db_putnchars(BACKUP, db_lc - db_lbuf_start); db_putnchars(BLANK, db_le - db_lbuf_start); db_putnchars(BACKUP, db_le - db_lbuf_start); db_le = strchr(db_lbuf_start, '\0'); if (db_le[-1] == '\r' || db_le[-1] == '\n') *--db_le = '\0'; db_lc = db_le; goto redraw; case -1: /* * eek! the console returned eof. * probably that means we HAVE no console.. we should try bail * XXX */ c = '\r'; case '\n': /* FALLTHROUGH */ case '\r': *db_le++ = c; return (1); default: if (db_le == db_lbuf_end) { cnputc('\007'); } else if (c >= ' ' && c <= '~') { char *p; for (p = db_le; p > db_lc; p--) *p = *(p-1); *db_lc++ = c; db_le++; cnputc(c); db_putstring(db_lc, db_le - db_lc); db_putnchars(BACKUP, db_le - db_lc); } break; } return (0); } +static int +cnmaygetc() +{ + return (-1); +} + int db_readline(lstart, lsize) char * lstart; int lsize; { if (lsize < 2) return (0); if (lsize != db_lhistlsize) { /* * (Re)initialize input line history. Throw away any * existing history. */ db_lhist_nlines = sizeof(db_lhistory) / lsize; db_lhistlsize = lsize; db_lhistidx = -1; } db_lhistcur = db_lhistidx; db_force_whitespace(); /* synch output position */ db_lbuf_start = lstart; db_lbuf_end = lstart + lsize - 2; /* Will append NL and NUL. */ db_lc = lstart; db_le = lstart; while (!db_inputchar(cngetc())) continue; db_capture_write(lstart, db_le - db_lbuf_start); db_printf("\n"); /* synch output position */ *db_le = 0; if (db_le - db_lbuf_start > 1) { /* Maintain input line history for non-empty lines. */ if (++db_lhistidx == db_lhist_nlines) { /* Rotate history. */ bcopy(db_lhistory + db_lhistlsize, db_lhistory, db_lhistlsize * (db_lhist_nlines - 1)); db_lhistidx--; } bcopy(lstart, db_lhistory + db_lhistidx * db_lhistlsize, db_lhistlsize); } return (db_le - db_lbuf_start); } void db_check_interrupt(void) { int c; - c = cncheckc(); + c = cnmaygetc(); switch (c) { case -1: /* no character */ return; case CTRL('c'): db_error((char *)0); /*NOTREACHED*/ case CTRL('s'): do { - c = cncheckc(); + c = cnmaygetc(); if (c == CTRL('c')) db_error((char *)0); } while (c != CTRL('q')); break; default: /* drop on floor */ break; } } Index: projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c (nonexistent) +++ projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c (revision 312218) @@ -0,0 +1,960 @@ +/*- + * Copyright (c) 2016 Hiroki Mori + * Copyright (c) 2013 Luiz Otavio O Souza. + * Copyright (c) 2011-2012 Stefan Bethke. + * Copyright (c) 2012 Adrian Chadd. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This is Micrel KSZ8995MA driver code. KSZ8995MA use SPI bus on control. + * This code development on @SRCHACK's ksz8995ma board and FON2100 with + * gpiospi. + * etherswitchcfg command port option support addtag, ingress, striptag, + * dropuntagged. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include "spibus_if.h" +#include "miibus_if.h" +#include "etherswitch_if.h" + +#define KSZ8995MA_SPI_READ 0x03 +#define KSZ8995MA_SPI_WRITE 0x02 + +#define KSZ8995MA_CID0 0x00 +#define KSZ8995MA_CID1 0x01 + +#define KSZ8995MA_GC0 0x02 +#define KSZ8995MA_GC1 0x03 +#define KSZ8995MA_GC2 0x04 +#define KSZ8995MA_GC3 0x05 + +#define KSZ8995MA_PORT_SIZE 0x10 + +#define KSZ8995MA_PC0_BASE 0x10 +#define KSZ8995MA_PC1_BASE 0x11 +#define KSZ8995MA_PC2_BASE 0x12 +#define KSZ8995MA_PC3_BASE 0x13 +#define KSZ8995MA_PC4_BASE 0x14 +#define KSZ8995MA_PC5_BASE 0x15 +#define KSZ8995MA_PC6_BASE 0x16 +#define KSZ8995MA_PC7_BASE 0x17 +#define KSZ8995MA_PC8_BASE 0x18 +#define KSZ8995MA_PC9_BASE 0x19 +#define KSZ8995MA_PC10_BASE 0x1a +#define KSZ8995MA_PC11_BASE 0x1b +#define KSZ8995MA_PC12_BASE 0x1c +#define KSZ8995MA_PC13_BASE 0x1d + +#define KSZ8995MA_PS0_BASE 0x1e + +#define KSZ8995MA_PC14_BASE 0x1f + +#define KSZ8995MA_IAC0 0x6e +#define KSZ8995MA_IAC1 0x6f +#define KSZ8995MA_IDR8 0x70 +#define KSZ8995MA_IDR7 0x71 +#define KSZ8995MA_IDR6 0x72 +#define KSZ8995MA_IDR5 0x73 +#define KSZ8995MA_IDR4 0x74 +#define KSZ8995MA_IDR3 0x75 +#define KSZ8995MA_IDR2 0x76 +#define KSZ8995MA_IDR1 0x77 +#define KSZ8995MA_IDR0 0x78 + +#define KSZ8995MA_FAMILI_ID 0x95 +#define KSZ8995MA_CHIP_ID 0x00 +#define KSZ8995MA_CHIP_ID_MASK 0xf0 +#define KSZ8995MA_START 0x01 +#define KSZ8995MA_VLAN_ENABLE 0x80 +#define KSZ8995MA_TAG_INS 0x04 +#define KSZ8995MA_TAG_RM 0x02 +#define KSZ8995MA_INGR_FILT 0x40 +#define KSZ8995MA_DROP_NONPVID 0x20 + +#define KSZ8995MA_PDOWN 0x08 +#define KSZ8995MA_STARTNEG 0x20 + +#define KSZ8995MA_MII_STAT 0x7808 +#define KSZ8995MA_MII_PHYID_H 0x0022 +#define KSZ8995MA_MII_PHYID_L 0x1450 +#define KSZ8995MA_MII_AA 0x0401 + +#define KSZ8995MA_VLAN_TABLE_VALID 0x20 +#define KSZ8995MA_VLAN_TABLE_READ 0x14 +#define KSZ8995MA_VLAN_TABLE_WRITE 0x04 + +#define KSZ8995MA_MAX_PORT 5 + +MALLOC_DECLARE(M_KSZ8995MA); +MALLOC_DEFINE(M_KSZ8995MA, "ksz8995ma", "ksz8995ma data structures"); + +struct ksz8995ma_softc { + struct mtx sc_mtx; /* serialize access to softc */ + device_t sc_dev; + int vlan_mode; + int media; /* cpu port media */ + int cpuport; /* which PHY is connected to the CPU */ + int phymask; /* PHYs we manage */ + int numports; /* number of ports */ + int ifpport[KSZ8995MA_MAX_PORT]; + int *portphy; + char **ifname; + device_t **miibus; + struct ifnet **ifp; + struct callout callout_tick; + etherswitch_info_t info; +}; + +#define KSZ8995MA_LOCK(_sc) \ + mtx_lock(&(_sc)->sc_mtx) +#define KSZ8995MA_UNLOCK(_sc) \ + mtx_unlock(&(_sc)->sc_mtx) +#define KSZ8995MA_LOCK_ASSERT(_sc, _what) \ + mtx_assert(&(_sc)->sc_mtx, (_what)) +#define KSZ8995MA_TRYLOCK(_sc) \ + mtx_trylock(&(_sc)->sc_mtx) + +#if defined(DEBUG) +#define DPRINTF(dev, args...) device_printf(dev, args) +#else +#define DPRINTF(dev, args...) +#endif + +static inline int ksz8995ma_portforphy(struct ksz8995ma_softc *, int); +static void ksz8995ma_tick(void *); +static int ksz8995ma_ifmedia_upd(struct ifnet *); +static void ksz8995ma_ifmedia_sts(struct ifnet *, struct ifmediareq *); +static int ksz8995ma_readreg(device_t dev, int addr); +static int ksz8995ma_writereg(device_t dev, int addr, int value); +static void ksz8995ma_portvlanreset(device_t dev); + +static int +ksz8995ma_probe(device_t dev) +{ + int id0, id1; + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + bzero(sc, sizeof(*sc)); + + id0 = ksz8995ma_readreg(dev, KSZ8995MA_CID0); + id1 = ksz8995ma_readreg(dev, KSZ8995MA_CID1); + if (bootverbose) + device_printf(dev,"Chip Identifier Register %x %x\n", id0, id1); + + /* check Product Code */ + if (id0 != KSZ8995MA_FAMILI_ID || (id1 & KSZ8995MA_CHIP_ID_MASK) != + KSZ8995MA_CHIP_ID) { + return (ENXIO); + } + + device_set_desc_copy(dev, "Micrel KSZ8995MA SPI switch driver"); + return (BUS_PROBE_DEFAULT); +} + +static int +ksz8995ma_attach_phys(struct ksz8995ma_softc *sc) +{ + int phy, port, err; + char name[IFNAMSIZ]; + + port = 0; + err = 0; + /* PHYs need an interface, so we generate a dummy one */ + snprintf(name, IFNAMSIZ, "%sport", device_get_nameunit(sc->sc_dev)); + for (phy = 0; phy < sc->numports; phy++) { + if (phy == sc->cpuport) + continue; + if (((1 << phy) & sc->phymask) == 0) + continue; + sc->ifpport[phy] = port; + sc->portphy[port] = phy; + sc->ifp[port] = if_alloc(IFT_ETHER); + sc->ifp[port]->if_softc = sc; + sc->ifp[port]->if_flags |= IFF_UP | IFF_BROADCAST | + IFF_DRV_RUNNING | IFF_SIMPLEX; + if_initname(sc->ifp[port], name, port); + sc->miibus[port] = malloc(sizeof(device_t), M_KSZ8995MA, + M_WAITOK | M_ZERO); + if (sc->miibus[port] == NULL) { + err = ENOMEM; + goto failed; + } + err = mii_attach(sc->sc_dev, sc->miibus[port], sc->ifp[port], + ksz8995ma_ifmedia_upd, ksz8995ma_ifmedia_sts, \ + BMSR_DEFCAPMASK, phy, MII_OFFSET_ANY, 0); + DPRINTF(sc->sc_dev, "%s attached to pseudo interface %s\n", + device_get_nameunit(*sc->miibus[port]), + sc->ifp[port]->if_xname); + if (err != 0) { + device_printf(sc->sc_dev, + "attaching PHY %d failed\n", + phy); + goto failed; + } + ++port; + } + sc->info.es_nports = port; + if (sc->cpuport != -1) { + /* cpu port is MAC5 on ksz8995ma */ + sc->ifpport[sc->cpuport] = port; + sc->portphy[port] = sc->cpuport; + ++sc->info.es_nports; + } + + return (0); + +failed: + for (phy = 0; phy < sc->numports; phy++) { + if (((1 << phy) & sc->phymask) == 0) + continue; + port = ksz8995ma_portforphy(sc, phy); + if (sc->miibus[port] != NULL) + device_delete_child(sc->sc_dev, (*sc->miibus[port])); + if (sc->ifp[port] != NULL) + if_free(sc->ifp[port]); + if (sc->ifname[port] != NULL) + free(sc->ifname[port], M_KSZ8995MA); + if (sc->miibus[port] != NULL) + free(sc->miibus[port], M_KSZ8995MA); + } + return (err); +} + +static int +ksz8995ma_attach(device_t dev) +{ + struct ksz8995ma_softc *sc; + int err, reg; + + err = 0; + sc = device_get_softc(dev); + + sc->sc_dev = dev; + mtx_init(&sc->sc_mtx, "ksz8995ma", NULL, MTX_DEF); + strlcpy(sc->info.es_name, device_get_desc(dev), + sizeof(sc->info.es_name)); + + /* KSZ8995MA Defaults */ + sc->numports = KSZ8995MA_MAX_PORT; + sc->phymask = (1 << (KSZ8995MA_MAX_PORT + 1)) - 1; + sc->cpuport = -1; + sc->media = 100; + + (void) resource_int_value(device_get_name(dev), device_get_unit(dev), + "cpuport", &sc->cpuport); + + sc->info.es_nvlangroups = 16; + sc->info.es_vlan_caps = ETHERSWITCH_VLAN_PORT | ETHERSWITCH_VLAN_DOT1Q; + + sc->ifp = malloc(sizeof(struct ifnet *) * sc->numports, M_KSZ8995MA, + M_WAITOK | M_ZERO); + sc->ifname = malloc(sizeof(char *) * sc->numports, M_KSZ8995MA, + M_WAITOK | M_ZERO); + sc->miibus = malloc(sizeof(device_t *) * sc->numports, M_KSZ8995MA, + M_WAITOK | M_ZERO); + sc->portphy = malloc(sizeof(int) * sc->numports, M_KSZ8995MA, + M_WAITOK | M_ZERO); + + if (sc->ifp == NULL || sc->ifname == NULL || sc->miibus == NULL || + sc->portphy == NULL) { + err = ENOMEM; + goto failed; + } + + /* + * Attach the PHYs and complete the bus enumeration. + */ + err = ksz8995ma_attach_phys(sc); + if (err != 0) + goto failed; + + bus_generic_probe(dev); + bus_enumerate_hinted_children(dev); + err = bus_generic_attach(dev); + if (err != 0) + goto failed; + + callout_init(&sc->callout_tick, 0); + + ksz8995ma_tick(sc); + + /* start switch */ + sc->vlan_mode = 0; + reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3); + ksz8995ma_writereg(dev, KSZ8995MA_GC3, + reg & ~KSZ8995MA_VLAN_ENABLE); + ksz8995ma_portvlanreset(dev); + ksz8995ma_writereg(dev, KSZ8995MA_CID1, KSZ8995MA_START); + + return (0); + +failed: + if (sc->portphy != NULL) + free(sc->portphy, M_KSZ8995MA); + if (sc->miibus != NULL) + free(sc->miibus, M_KSZ8995MA); + if (sc->ifname != NULL) + free(sc->ifname, M_KSZ8995MA); + if (sc->ifp != NULL) + free(sc->ifp, M_KSZ8995MA); + + return (err); +} + +static int +ksz8995ma_detach(device_t dev) +{ + struct ksz8995ma_softc *sc; + int i, port; + + sc = device_get_softc(dev); + + callout_drain(&sc->callout_tick); + + for (i = 0; i < KSZ8995MA_MAX_PORT; i++) { + if (((1 << i) & sc->phymask) == 0) + continue; + port = ksz8995ma_portforphy(sc, i); + if (sc->miibus[port] != NULL) + device_delete_child(dev, (*sc->miibus[port])); + if (sc->ifp[port] != NULL) + if_free(sc->ifp[port]); + free(sc->ifname[port], M_KSZ8995MA); + free(sc->miibus[port], M_KSZ8995MA); + } + + free(sc->portphy, M_KSZ8995MA); + free(sc->miibus, M_KSZ8995MA); + free(sc->ifname, M_KSZ8995MA); + free(sc->ifp, M_KSZ8995MA); + + bus_generic_detach(dev); + mtx_destroy(&sc->sc_mtx); + + return (0); +} + +/* + * Convert PHY number to port number. + */ +static inline int +ksz8995ma_portforphy(struct ksz8995ma_softc *sc, int phy) +{ + + return (sc->ifpport[phy]); +} + +static inline struct mii_data * +ksz8995ma_miiforport(struct ksz8995ma_softc *sc, int port) +{ + + if (port < 0 || port > sc->numports) + return (NULL); + if (port == sc->cpuport) + return (NULL); + return (device_get_softc(*sc->miibus[port])); +} + +static inline struct ifnet * +ksz8995ma_ifpforport(struct ksz8995ma_softc *sc, int port) +{ + + if (port < 0 || port > sc->numports) + return (NULL); + return (sc->ifp[port]); +} + +/* + * Poll the status for all PHYs. + */ +static void +ksz8995ma_miipollstat(struct ksz8995ma_softc *sc) +{ + int i, port; + struct mii_data *mii; + struct mii_softc *miisc; + + KSZ8995MA_LOCK_ASSERT(sc, MA_NOTOWNED); + + for (i = 0; i < KSZ8995MA_MAX_PORT; i++) { + if (i == sc->cpuport) + continue; + if (((1 << i) & sc->phymask) == 0) + continue; + port = ksz8995ma_portforphy(sc, i); + if ((*sc->miibus[port]) == NULL) + continue; + mii = device_get_softc(*sc->miibus[port]); + LIST_FOREACH(miisc, &mii->mii_phys, mii_list) { + if (IFM_INST(mii->mii_media.ifm_cur->ifm_media) != + miisc->mii_inst) + continue; + ukphy_status(miisc); + mii_phy_update(miisc, MII_POLLSTAT); + } + } +} + +static void +ksz8995ma_tick(void *arg) +{ + struct ksz8995ma_softc *sc; + + sc = arg; + + ksz8995ma_miipollstat(sc); + callout_reset(&sc->callout_tick, hz, ksz8995ma_tick, sc); +} + +static void +ksz8995ma_lock(device_t dev) +{ + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + + KSZ8995MA_LOCK_ASSERT(sc, MA_NOTOWNED); + KSZ8995MA_LOCK(sc); +} + +static void +ksz8995ma_unlock(device_t dev) +{ + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + + KSZ8995MA_LOCK_ASSERT(sc, MA_OWNED); + KSZ8995MA_UNLOCK(sc); +} + +static etherswitch_info_t * +ksz8995ma_getinfo(device_t dev) +{ + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + + return (&sc->info); +} + +static int +ksz8995ma_getport(device_t dev, etherswitch_port_t *p) +{ + struct ksz8995ma_softc *sc; + struct mii_data *mii; + struct ifmediareq *ifmr; + int phy, err; + int tag1, tag2, portreg; + + sc = device_get_softc(dev); + ifmr = &p->es_ifmr; + + if (p->es_port < 0 || p->es_port >= sc->numports) + return (ENXIO); + + if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) { + tag1 = ksz8995ma_readreg(dev, KSZ8995MA_PC3_BASE + + KSZ8995MA_PORT_SIZE * p->es_port); + tag2 = ksz8995ma_readreg(dev, KSZ8995MA_PC4_BASE + + KSZ8995MA_PORT_SIZE * p->es_port); + p->es_pvid = (tag1 & 0x0f) << 8 | tag2; + + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC0_BASE + + KSZ8995MA_PORT_SIZE * p->es_port); + if (portreg & KSZ8995MA_TAG_INS) + p->es_flags |= ETHERSWITCH_PORT_ADDTAG; + if (portreg & KSZ8995MA_TAG_RM) + p->es_flags |= ETHERSWITCH_PORT_STRIPTAG; + + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC2_BASE + + KSZ8995MA_PORT_SIZE * p->es_port); + if (portreg & KSZ8995MA_DROP_NONPVID) + p->es_flags |= ETHERSWITCH_PORT_DROPUNTAGGED; + if (portreg & KSZ8995MA_INGR_FILT) + p->es_flags |= ETHERSWITCH_PORT_INGRESS; + } + + phy = sc->portphy[p->es_port]; + mii = ksz8995ma_miiforport(sc, p->es_port); + if (sc->cpuport != -1 && phy == sc->cpuport) { + /* fill in fixed values for CPU port */ + p->es_flags |= ETHERSWITCH_PORT_CPU; + ifmr->ifm_count = 0; + if (sc->media == 100) + ifmr->ifm_current = ifmr->ifm_active = + IFM_ETHER | IFM_100_TX | IFM_FDX; + else + ifmr->ifm_current = ifmr->ifm_active = + IFM_ETHER | IFM_1000_T | IFM_FDX; + ifmr->ifm_mask = 0; + ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID; + } else if (mii != NULL) { + err = ifmedia_ioctl(mii->mii_ifp, &p->es_ifr, + &mii->mii_media, SIOCGIFMEDIA); + if (err) + return (err); + } else { + return (ENXIO); + } + + return (0); +} + +static int +ksz8995ma_setport(device_t dev, etherswitch_port_t *p) +{ + struct ksz8995ma_softc *sc; + struct mii_data *mii; + struct ifmedia *ifm; + struct ifnet *ifp; + int phy, err; + int portreg; + + sc = device_get_softc(dev); + + if (p->es_port < 0 || p->es_port >= sc->numports) + return (ENXIO); + + if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) { + ksz8995ma_writereg(dev, KSZ8995MA_PC4_BASE + + KSZ8995MA_PORT_SIZE * p->es_port, p->es_pvid & 0xff); + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC3_BASE + + KSZ8995MA_PORT_SIZE * p->es_port); + ksz8995ma_writereg(dev, KSZ8995MA_PC3_BASE + + KSZ8995MA_PORT_SIZE * p->es_port, + (portreg & 0xf0) | ((p->es_pvid >> 8) & 0x0f)); + + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC0_BASE + + KSZ8995MA_PORT_SIZE * p->es_port); + if (p->es_flags & ETHERSWITCH_PORT_ADDTAG) + portreg |= KSZ8995MA_TAG_INS; + else + portreg &= ~KSZ8995MA_TAG_INS; + if (p->es_flags & ETHERSWITCH_PORT_STRIPTAG) + portreg |= KSZ8995MA_TAG_RM; + else + portreg &= ~KSZ8995MA_TAG_RM; + ksz8995ma_writereg(dev, KSZ8995MA_PC0_BASE + + KSZ8995MA_PORT_SIZE * p->es_port, portreg); + + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC2_BASE + + KSZ8995MA_PORT_SIZE * p->es_port); + if (p->es_flags & ETHERSWITCH_PORT_DROPUNTAGGED) + portreg |= KSZ8995MA_DROP_NONPVID; + else + portreg &= ~KSZ8995MA_DROP_NONPVID; + if (p->es_flags & ETHERSWITCH_PORT_INGRESS) + portreg |= KSZ8995MA_INGR_FILT; + else + portreg &= ~KSZ8995MA_INGR_FILT; + ksz8995ma_writereg(dev, KSZ8995MA_PC2_BASE + + KSZ8995MA_PORT_SIZE * p->es_port, portreg); + } + + phy = sc->portphy[p->es_port]; + mii = ksz8995ma_miiforport(sc, p->es_port); + if (phy != sc->cpuport) { + if (mii == NULL) + return (ENXIO); + ifp = ksz8995ma_ifpforport(sc, p->es_port); + ifm = &mii->mii_media; + err = ifmedia_ioctl(ifp, &p->es_ifr, ifm, SIOCSIFMEDIA); + } + return (0); +} + +static int +ksz8995ma_getvgroup(device_t dev, etherswitch_vlangroup_t *vg) +{ + int data0, data1, data2; + int vlantab; + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + + if (sc->vlan_mode == ETHERSWITCH_VLAN_PORT) { + if (vg->es_vlangroup < sc->numports) { + vg->es_vid = ETHERSWITCH_VID_VALID; + vg->es_vid |= vg->es_vlangroup; + data0 = ksz8995ma_readreg(dev, KSZ8995MA_PC1_BASE + + KSZ8995MA_PORT_SIZE * vg->es_vlangroup); + vg->es_member_ports = data0 & 0x1f; + vg->es_untagged_ports = vg->es_member_ports; + vg->es_fid = 0; + } else { + vg->es_vid = 0; + } + } else if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) { + ksz8995ma_writereg(dev, KSZ8995MA_IAC0, + KSZ8995MA_VLAN_TABLE_READ); + ksz8995ma_writereg(dev, KSZ8995MA_IAC1, vg->es_vlangroup); + data2 = ksz8995ma_readreg(dev, KSZ8995MA_IDR2); + data1 = ksz8995ma_readreg(dev, KSZ8995MA_IDR1); + data0 = ksz8995ma_readreg(dev, KSZ8995MA_IDR0); + vlantab = data2 << 16 | data1 << 8 | data0; + if (data2 & KSZ8995MA_VLAN_TABLE_VALID) { + vg->es_vid = ETHERSWITCH_VID_VALID; + vg->es_vid |= vlantab & 0xfff; + vg->es_member_ports = (vlantab >> 16) & 0x1f; + vg->es_untagged_ports = vg->es_member_ports; + vg->es_fid = (vlantab >> 12) & 0x0f; + } else { + vg->es_fid = 0; + } + } + + return (0); +} + +static int +ksz8995ma_setvgroup(device_t dev, etherswitch_vlangroup_t *vg) +{ + struct ksz8995ma_softc *sc; + int data0; + + sc = device_get_softc(dev); + + if (sc->vlan_mode == ETHERSWITCH_VLAN_PORT) { + data0 = ksz8995ma_readreg(dev, KSZ8995MA_PC1_BASE + + KSZ8995MA_PORT_SIZE * vg->es_vlangroup); + ksz8995ma_writereg(dev, KSZ8995MA_PC1_BASE + + KSZ8995MA_PORT_SIZE * vg->es_vlangroup, + (data0 & 0xe0) | (vg->es_member_ports & 0x1f)); + } else if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) { + if (vg->es_member_ports != 0) { + ksz8995ma_writereg(dev, KSZ8995MA_IDR2, + KSZ8995MA_VLAN_TABLE_VALID | + (vg->es_member_ports & 0x1f)); + ksz8995ma_writereg(dev, KSZ8995MA_IDR1, + vg->es_fid << 4 | vg->es_vid >> 8); + ksz8995ma_writereg(dev, KSZ8995MA_IDR0, + vg->es_vid & 0xff); + } else { + ksz8995ma_writereg(dev, KSZ8995MA_IDR2, 0); + ksz8995ma_writereg(dev, KSZ8995MA_IDR1, 0); + ksz8995ma_writereg(dev, KSZ8995MA_IDR0, 0); + } + ksz8995ma_writereg(dev, KSZ8995MA_IAC0, + KSZ8995MA_VLAN_TABLE_WRITE); + ksz8995ma_writereg(dev, KSZ8995MA_IAC1, vg->es_vlangroup); + } + + return (0); +} + +static int +ksz8995ma_getconf(device_t dev, etherswitch_conf_t *conf) +{ + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + + /* Return the VLAN mode. */ + conf->cmd = ETHERSWITCH_CONF_VLAN_MODE; + conf->vlan_mode = sc->vlan_mode; + + return (0); +} + +static void +ksz8995ma_portvlanreset(device_t dev) +{ + int i, data; + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + + for (i = 0; i < sc->numports; ++i) { + data = ksz8995ma_readreg(dev, KSZ8995MA_PC1_BASE + + KSZ8995MA_PORT_SIZE * i); + ksz8995ma_writereg(dev, KSZ8995MA_PC1_BASE + + KSZ8995MA_PORT_SIZE * i, (data & 0xe0) | 0x1f); + } +} + +static int +ksz8995ma_setconf(device_t dev, etherswitch_conf_t *conf) +{ + int reg; + struct ksz8995ma_softc *sc; + + sc = device_get_softc(dev); + + if ((conf->cmd & ETHERSWITCH_CONF_VLAN_MODE) == 0) + return (0); + + if (conf->vlan_mode == ETHERSWITCH_VLAN_PORT) { + sc->vlan_mode = ETHERSWITCH_VLAN_PORT; + reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3); + ksz8995ma_writereg(dev, KSZ8995MA_GC3, + reg & ~KSZ8995MA_VLAN_ENABLE); + ksz8995ma_portvlanreset(dev); + } else if (conf->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) { + sc->vlan_mode = ETHERSWITCH_VLAN_DOT1Q; + reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3); + ksz8995ma_writereg(dev, KSZ8995MA_GC3, + reg | KSZ8995MA_VLAN_ENABLE); + } else { + sc->vlan_mode = 0; + reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3); + ksz8995ma_writereg(dev, KSZ8995MA_GC3, + reg & ~KSZ8995MA_VLAN_ENABLE); + ksz8995ma_portvlanreset(dev); + } + return (0); +} + +static void +ksz8995ma_statchg(device_t dev) +{ + + DPRINTF(dev, "%s\n", __func__); +} + +static int +ksz8995ma_ifmedia_upd(struct ifnet *ifp) +{ + struct ksz8995ma_softc *sc; + struct mii_data *mii; + + sc = ifp->if_softc; + mii = ksz8995ma_miiforport(sc, ifp->if_dunit); + + DPRINTF(sc->sc_dev, "%s\n", __func__); + if (mii == NULL) + return (ENXIO); + mii_mediachg(mii); + return (0); +} + +static void +ksz8995ma_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) +{ + struct ksz8995ma_softc *sc; + struct mii_data *mii; + + sc = ifp->if_softc; + mii = ksz8995ma_miiforport(sc, ifp->if_dunit); + + DPRINTF(sc->sc_dev, "%s\n", __func__); + + if (mii == NULL) + return; + mii_pollstat(mii); + ifmr->ifm_active = mii->mii_media_active; + ifmr->ifm_status = mii->mii_media_status; +} + +static int +ksz8995ma_readphy(device_t dev, int phy, int reg) +{ +int portreg; + + /* + * This is no mdio/mdc connection code. + * simulate MIIM Registers via the SPI interface + */ + if (reg == MII_BMSR) { + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PS0_BASE + + KSZ8995MA_PORT_SIZE * phy); + return (KSZ8995MA_MII_STAT | + (portreg & 0x20 ? BMSR_LINK : 0x00) | + (portreg & 0x40 ? BMSR_ACOMP : 0x00)); + } else if (reg == MII_PHYIDR1) { + return (KSZ8995MA_MII_PHYID_H); + } else if (reg == MII_PHYIDR2) { + return (KSZ8995MA_MII_PHYID_L); + } else if (reg == MII_ANAR) { + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC12_BASE + + KSZ8995MA_PORT_SIZE * phy); + return (KSZ8995MA_MII_AA | (portreg & 0x0f) << 5); + } else if (reg == MII_ANLPAR) { + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PS0_BASE + + KSZ8995MA_PORT_SIZE * phy); + return (((portreg & 0x0f) << 5) | 0x01); + } + + return (0); +} + +static int +ksz8995ma_writephy(device_t dev, int phy, int reg, int data) +{ +int portreg; + + /* + * This is no mdio/mdc connection code. + * simulate MIIM Registers via the SPI interface + */ + if (reg == MII_BMCR) { + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC13_BASE + + KSZ8995MA_PORT_SIZE * phy); + if (data & BMCR_PDOWN) + portreg |= KSZ8995MA_PDOWN; + else + portreg &= ~KSZ8995MA_PDOWN; + if (data & BMCR_STARTNEG) + portreg |= KSZ8995MA_STARTNEG; + else + portreg &= ~KSZ8995MA_STARTNEG; + ksz8995ma_writereg(dev, KSZ8995MA_PC13_BASE + + KSZ8995MA_PORT_SIZE * phy, portreg); + } else if (reg == MII_ANAR) { + portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC12_BASE + + KSZ8995MA_PORT_SIZE * phy); + portreg &= 0xf; + portreg |= ((data >> 5) & 0x0f); + ksz8995ma_writereg(dev, KSZ8995MA_PC12_BASE + + KSZ8995MA_PORT_SIZE * phy, portreg); + } + return (0); +} + +static int +ksz8995ma_readreg(device_t dev, int addr) +{ + uint8_t txBuf[8], rxBuf[8]; + struct spi_command cmd; + int err; + + memset(&cmd, 0, sizeof(cmd)); + memset(txBuf, 0, sizeof(txBuf)); + memset(rxBuf, 0, sizeof(rxBuf)); + + /* read spi */ + txBuf[0] = KSZ8995MA_SPI_READ; + txBuf[1] = addr; + cmd.tx_cmd = &txBuf; + cmd.rx_cmd = &rxBuf; + cmd.tx_cmd_sz = 3; + cmd.rx_cmd_sz = 3; + err = SPIBUS_TRANSFER(device_get_parent(dev), dev, &cmd); + if (err) + return(0); + + return (rxBuf[2]); +} + +static int +ksz8995ma_writereg(device_t dev, int addr, int value) +{ + uint8_t txBuf[8], rxBuf[8]; + struct spi_command cmd; + int err; + + memset(&cmd, 0, sizeof(cmd)); + memset(txBuf, 0, sizeof(txBuf)); + memset(rxBuf, 0, sizeof(rxBuf)); + + /* write spi */ + txBuf[0] = KSZ8995MA_SPI_WRITE; + txBuf[1] = addr; + txBuf[2] = value; + cmd.tx_cmd = &txBuf; + cmd.rx_cmd = &rxBuf; + cmd.tx_cmd_sz = 3; + cmd.rx_cmd_sz = 3; + err = SPIBUS_TRANSFER(device_get_parent(dev), dev, &cmd); + if (err) + return(0); + + return (0); +} + +static device_method_t ksz8995ma_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ksz8995ma_probe), + DEVMETHOD(device_attach, ksz8995ma_attach), + DEVMETHOD(device_detach, ksz8995ma_detach), + + /* bus interface */ + DEVMETHOD(bus_add_child, device_add_child_ordered), + + /* MII interface */ + DEVMETHOD(miibus_readreg, ksz8995ma_readphy), + DEVMETHOD(miibus_writereg, ksz8995ma_writephy), + DEVMETHOD(miibus_statchg, ksz8995ma_statchg), + + /* etherswitch interface */ + DEVMETHOD(etherswitch_lock, ksz8995ma_lock), + DEVMETHOD(etherswitch_unlock, ksz8995ma_unlock), + DEVMETHOD(etherswitch_getinfo, ksz8995ma_getinfo), + DEVMETHOD(etherswitch_readreg, ksz8995ma_readreg), + DEVMETHOD(etherswitch_writereg, ksz8995ma_writereg), + DEVMETHOD(etherswitch_readphyreg, ksz8995ma_readphy), + DEVMETHOD(etherswitch_writephyreg, ksz8995ma_writephy), + DEVMETHOD(etherswitch_getport, ksz8995ma_getport), + DEVMETHOD(etherswitch_setport, ksz8995ma_setport), + DEVMETHOD(etherswitch_getvgroup, ksz8995ma_getvgroup), + DEVMETHOD(etherswitch_setvgroup, ksz8995ma_setvgroup), + DEVMETHOD(etherswitch_setconf, ksz8995ma_setconf), + DEVMETHOD(etherswitch_getconf, ksz8995ma_getconf), + + DEVMETHOD_END +}; + +DEFINE_CLASS_0(ksz8995ma, ksz8995ma_driver, ksz8995ma_methods, + sizeof(struct ksz8995ma_softc)); +static devclass_t ksz8995ma_devclass; + +DRIVER_MODULE(ksz8995ma, spibus, ksz8995ma_driver, ksz8995ma_devclass, 0, 0); +DRIVER_MODULE(miibus, ksz8995ma, miibus_driver, miibus_devclass, 0, 0); +DRIVER_MODULE(etherswitch, ksz8995ma, etherswitch_driver, etherswitch_devclass, + 0, 0); +MODULE_VERSION(ksz8995ma, 1); +MODULE_DEPEND(ksz8995ma, spibus, 1, 1, 1); /* XXX which versions? */ +MODULE_DEPEND(ksz8995ma, miibus, 1, 1, 1); /* XXX which versions? */ +MODULE_DEPEND(ksz8995ma, etherswitch, 1, 1, 1); /* XXX which versions? */ Property changes on: projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/netbsd-tests-upstream-01-2017/sys/i386/i386/pmap.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/i386/i386/pmap.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/i386/i386/pmap.c (revision 312218) @@ -1,5639 +1,5645 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; extern u_int32_t KERNend; extern u_int32_t KPTphys; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; struct msgbuf *msgbufp = NULL; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2; static pt_entry_t *PADDR1 = NULL, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait); #endif static void pmap_set_pg(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). Otherwise, * pmap_update_pde_kernel() could access allpmaps while it is * being changed. */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ pc = pcpu_find(curcpu); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; /* * Adjust the start of the KPTD and KPTmap so that the implementation * of pmap_kextract() and pmap_growkernel() can be made simpler. */ KPTD -= KPTDI; KPTmap -= i386_btop(KPTDI << PDRSHIFT); /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ static void pmap_set_pg(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag == 0) return; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir_pde(PTD, va) |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pmap_t pmap; boolean_t PTD_updated; PTD_updated = FALSE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) PTD_updated = TRUE; pde = pmap_pde(pmap, va); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); KASSERT(PTD_updated, ("pmap_kenter_pde: current page table is not in allpmaps")); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } } void invltlb_glob(void) { uint64_t cr4; if (pgeflag == 0) { invltlb(); } else { cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); load_cr4(cr4 | CR4_PGE); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb_glob(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { /* * Elsewhere, this operation requires allpmaps_lock for * synchronization. Here, it does not because it is being * performed in the context of an all_cpus rendezvous. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, act->va); pde_store(pde, act->newpde); } } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendevous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)cpu_clflush_line_size; } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the mfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); mfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || pmap == vmspace_pmap(curthread->td_proc->p_vmspace)); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | pgeflag | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pgeflag | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); /* * Since the page table directory is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) if ((ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Copy the kernel page table directory entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); mtx_unlock_spin(&allpmaps_lock); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt[i] = pa | PG_V; #endif } CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); pmap_invalidate_page(pmap, trunc_4mpage(va)); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (va >= KERNBASE) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpde & PG_G) pmap_invalidate_page(kernel_pmap, sva); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if (oldpde & PG_MANAGED) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { if (!pde_cmpset(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PS | newpde); else pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); pde = pmap_pde(pmap, va); if (va < VM_MAXUSER_ADDRESS) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; u_int ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. */ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { #if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else #endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = pcpu_find(curcpu); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; - mtx_unlock(&pc->pc_cmap_lock); + + /* + * Unpin the thread before releasing the lock. Otherwise the thread + * could be rescheduled while still bound to the current CPU, only + * to unpin itself immediately upon resuming execution. + */ sched_unpin(); + mtx_unlock(&pc->pc_cmap_lock); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = pcpu_find(curcpu); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; - mtx_unlock(&pc->pc_cmap_lock); sched_unpin(); + mtx_unlock(&pc->pc_cmap_lock); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = pcpu_find(curcpu); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; - mtx_unlock(&pc->pc_cmap_lock); sched_unpin(); + mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = pcpu_find(curcpu); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; - mtx_unlock(&pc->pc_cmap_lock); sched_unpin(); + mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = pcpu_find(curcpu); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; - mtx_unlock(&pc->pc_cmap_lock); sched_unpin(); + mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif Index: projects/netbsd-tests-upstream-01-2017/sys/kern/kern_acct.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/kern/kern_acct.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/kern/kern_acct.c (revision 312218) @@ -1,652 +1,652 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1994 Christopher G. Demetriou * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The routines implemented in this file are described in: * Leffler, et al.: The Design and Implementation of the 4.3BSD * UNIX Operating System (Addison Welley, 1989) * on pages 62-63. * On May 2007 the historic 3 bits base 8 exponent, 13 bit fraction * compt_t representation described in the above reference was replaced * with that of IEEE-754 floats. * * Arguably, to simplify accounting operations, this mechanism should * be replaced by one in which an accounting log file (similar to /dev/klog) * is read by a user process, etc. However, that has its own problems. */ /* Floating point definitions from . */ #define FLT_MANT_DIG 24 /* p */ #define FLT_MAX_EXP 128 /* emax */ /* * Internal accounting functions. * The former's operation is described in Leffler, et al., and the latter * was provided by UCB with the 4.4BSD-Lite release */ static uint32_t encode_timeval(struct timeval); static uint32_t encode_long(long); static void acctwatch(void); static void acct_thread(void *); static int acct_disable(struct thread *, int); /* * Accounting vnode pointer, saved vnode pointer, and flags for each. * acct_sx protects against changes to the active vnode and credentials * while accounting records are being committed to disk. */ static int acct_configured; static int acct_suspended; static struct vnode *acct_vp; static struct ucred *acct_cred; static struct plimit *acct_limit; static int acct_flags; static struct sx acct_sx; SX_SYSINIT(acct, &acct_sx, "acct_sx"); /* * State of the accounting kthread. */ static int acct_state; #define ACCT_RUNNING 1 /* Accounting kthread is running. */ #define ACCT_EXITREQ 2 /* Accounting kthread should exit. */ /* * Values associated with enabling and disabling accounting */ static int acctsuspend = 2; /* stop accounting when < 2% free space left */ SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, &acctsuspend, 0, "percentage of free disk space below which accounting stops"); static int acctresume = 4; /* resume when free space risen to > 4% */ SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, &acctresume, 0, "percentage of free disk space above which accounting resumes"); static int acctchkfreq = 15; /* frequency (in seconds) to check space */ static int sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS) { int error, value; /* Write out the old value. */ error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int)); if (error || req->newptr == NULL) return (error); /* Read in and verify the new value. */ error = SYSCTL_IN(req, &value, sizeof(int)); if (error) return (error); if (value <= 0) return (EINVAL); acctchkfreq = value; return (0); } SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT|CTLFLAG_RW, &acctchkfreq, 0, sysctl_acct_chkfreq, "I", "frequency for checking the free space"); SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0, "Accounting configured or not"); SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0, "Accounting suspended or not"); /* * Accounting system call. Written based on the specification and previous * implementation done by Mark Tinguely. */ int sys_acct(struct thread *td, struct acct_args *uap) { struct nameidata nd; int error, flags, i, replacing; error = priv_check(td, PRIV_ACCT); if (error) return (error); /* * If accounting is to be started to a file, open that file for * appending and make sure it's a 'normal'. */ if (uap->path != NULL) { NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); flags = FWRITE | O_APPEND; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC error = mac_system_check_acct(td->td_ucred, nd.ni_vp); if (error) { VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, flags, td->td_ucred, td); return (error); } #endif VOP_UNLOCK(nd.ni_vp, 0); if (nd.ni_vp->v_type != VREG) { vn_close(nd.ni_vp, flags, td->td_ucred, td); return (EACCES); } #ifdef MAC } else { error = mac_system_check_acct(td->td_ucred, NULL); if (error) return (error); #endif } /* * Disallow concurrent access to the accounting vnode while we swap * it out, in order to prevent access after close. */ sx_xlock(&acct_sx); /* * Don't log spurious disable/enable messages if we are * switching from one accounting file to another due to log * rotation. */ replacing = (acct_vp != NULL && uap->path != NULL); /* * If accounting was previously enabled, kill the old space-watcher, * close the file, and (if no new file was specified, leave). Reset * the suspended state regardless of whether accounting remains * enabled. */ acct_suspended = 0; if (acct_vp != NULL) error = acct_disable(td, !replacing); if (uap->path == NULL) { if (acct_state & ACCT_RUNNING) { acct_state |= ACCT_EXITREQ; wakeup(&acct_state); } sx_xunlock(&acct_sx); return (error); } /* * Create our own plimit object without limits. It will be assigned * to exiting processes. */ acct_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) acct_limit->pl_rlimit[i].rlim_cur = acct_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; /* * Save the new accounting file vnode, and schedule the new * free space watcher. */ acct_vp = nd.ni_vp; acct_cred = crhold(td->td_ucred); acct_flags = flags; if (acct_state & ACCT_RUNNING) acct_state &= ~ACCT_EXITREQ; else { /* * Try to start up an accounting kthread. We may start more * than one, but if so the extras will commit suicide as * soon as they start up. */ error = kproc_create(acct_thread, NULL, NULL, 0, 0, "accounting"); if (error) { (void) acct_disable(td, 0); sx_xunlock(&acct_sx); log(LOG_NOTICE, "Unable to start accounting thread\n"); return (error); } } acct_configured = 1; sx_xunlock(&acct_sx); if (!replacing) log(LOG_NOTICE, "Accounting enabled\n"); return (error); } /* * Disable currently in-progress accounting by closing the vnode, dropping * our reference to the credential, and clearing the vnode's flags. */ static int acct_disable(struct thread *td, int logging) { int error; sx_assert(&acct_sx, SX_XLOCKED); error = vn_close(acct_vp, acct_flags, acct_cred, td); crfree(acct_cred); lim_free(acct_limit); acct_configured = 0; acct_vp = NULL; acct_cred = NULL; acct_flags = 0; if (logging) log(LOG_NOTICE, "Accounting disabled\n"); return (error); } /* * Write out process accounting information, on process exit. * Data to be written out is specified in Leffler, et al. * and are enumerated below. (They're also noted in the system * "acct.h" header file.) */ int acct_process(struct thread *td) { struct acctv2 acct; struct timeval ut, st, tmp; struct plimit *oldlim; struct proc *p; struct rusage ru; int t, ret; /* * Lockless check of accounting condition before doing the hard * work. */ if (acct_vp == NULL || acct_suspended) return (0); sx_slock(&acct_sx); /* * If accounting isn't enabled, don't bother. Have to check again * once we own the lock in case we raced with disabling of accounting * by another thread. */ if (acct_vp == NULL || acct_suspended) { sx_sunlock(&acct_sx); return (0); } p = td->td_proc; /* * Get process accounting information. */ sx_slock(&proctree_lock); PROC_LOCK(p); /* (1) The terminal from which the process was started */ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp); else acct.ac_tty = NODEV; sx_sunlock(&proctree_lock); /* (2) The name of the command that ran */ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); /* (3) The amount of user and system time that was used */ rufetchcalc(p, &ru, &ut, &st); acct.ac_utime = encode_timeval(ut); acct.ac_stime = encode_timeval(st); /* (4) The elapsed time the command ran (and its starting time) */ getboottime(&tmp); timevaladd(&tmp, &p->p_stats->p_start); acct.ac_btime = tmp.tv_sec; microuptime(&tmp); timevalsub(&tmp, &p->p_stats->p_start); acct.ac_etime = encode_timeval(tmp); /* (5) The average amount of memory used */ tmp = ut; timevaladd(&tmp, &st); /* Convert tmp (i.e. u + s) into hz units to match ru_i*. */ t = tmp.tv_sec * hz + tmp.tv_usec / tick; if (t) acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss + + ru.ru_isrss) / t); else acct.ac_mem = 0; /* (6) The number of disk I/O operations done */ acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock); /* (7) The UID and GID of the process */ acct.ac_uid = p->p_ucred->cr_ruid; acct.ac_gid = p->p_ucred->cr_rgid; /* (8) The boolean flags that tell how the process terminated, etc. */ acct.ac_flagx = p->p_acflag; /* Setup ancillary structure fields. */ acct.ac_flagx |= ANVER; acct.ac_zero = 0; acct.ac_version = 2; acct.ac_len = acct.ac_len2 = sizeof(acct); /* * Eliminate rlimits (file size limit in particular). */ oldlim = p->p_limit; p->p_limit = lim_hold(acct_limit); PROC_UNLOCK(p); lim_free(oldlim); /* * Write the accounting information to the file. */ ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct), (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED, NULL, td); sx_sunlock(&acct_sx); return (ret); } /* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */ /* Convert timevals and longs into IEEE-754 bit patterns. */ /* Mantissa mask (MSB is implied, so subtract 1). */ #define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1) /* * We calculate integer values to a precision of approximately * 28 bits. * This is high-enough precision to fill the 24 float bits * and low-enough to avoid overflowing the 32 int bits. */ #define CALC_BITS 28 /* log_2(1000000). */ #define LOG2_1M 20 /* * Convert the elements of a timeval into a 32-bit word holding * the bits of a IEEE-754 float. * The float value represents the timeval's value in microsecond units. */ static uint32_t encode_timeval(struct timeval tv) { int log2_s; - int val, exponent; /* Unnormalized value and exponent */ - int norm_exponent; /* Normalized exponent */ + int val, exp; /* Unnormalized value and exponent */ + int norm_exp; /* Normalized exponent */ int shift; /* * First calculate value and exponent to about CALC_BITS precision. * Note that the following conditionals have been ordered so that * the most common cases appear first. */ if (tv.tv_sec == 0) { if (tv.tv_usec == 0) return (0); - exponent = 0; + exp = 0; val = tv.tv_usec; } else { /* * Calculate the value to a precision of approximately * CALC_BITS. */ log2_s = fls(tv.tv_sec) - 1; if (log2_s + LOG2_1M < CALC_BITS) { - exponent = 0; + exp = 0; val = 1000000 * tv.tv_sec + tv.tv_usec; } else { - exponent = log2_s + LOG2_1M - CALC_BITS; + exp = log2_s + LOG2_1M - CALC_BITS; val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec + - tv.tv_usec) >> exponent); + tv.tv_usec) >> exp); } } /* Now normalize and pack the value into an IEEE-754 float. */ - norm_exponent = fls(val) - 1; - shift = FLT_MANT_DIG - norm_exponent - 1; + norm_exp = fls(val) - 1; + shift = FLT_MANT_DIG - norm_exp - 1; #ifdef ACCT_DEBUG printf("val=%d exp=%d shift=%d log2(val)=%d\n", - val, exponent, shift, norm_exponent); - printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exponent + norm_exponent, + val, exp, shift, norm_exp); + printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp, ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK)); #endif - return (((FLT_MAX_EXP - 1 + exponent + norm_exponent) << (FLT_MANT_DIG - 1)) | + return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) | ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK)); } /* * Convert a non-negative long value into the bit pattern of * an IEEE-754 float value. */ static uint32_t encode_long(long val) { - int norm_exponent; /* Normalized exponent */ + int norm_exp; /* Normalized exponent */ int shift; if (val == 0) return (0); if (val < 0) { log(LOG_NOTICE, "encode_long: negative value %ld in accounting record\n", val); val = LONG_MAX; } - norm_exponent = fls(val) - 1; - shift = FLT_MANT_DIG - norm_exponent - 1; + norm_exp = fls(val) - 1; + shift = FLT_MANT_DIG - norm_exp - 1; #ifdef ACCT_DEBUG printf("val=%d shift=%d log2(val)=%d\n", - val, shift, norm_exponent); - printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exponent, + val, shift, norm_exp); + printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp, ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK)); #endif - return (((FLT_MAX_EXP - 1 + norm_exponent) << (FLT_MANT_DIG - 1)) | + return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) | ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK)); } /* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */ /* * Periodically check the filesystem to see if accounting * should be turned on or off. Beware the case where the vnode * has been vgone()'d out from underneath us, e.g. when the file * system containing the accounting file has been forcibly unmounted. */ /* ARGSUSED */ static void acctwatch(void) { struct statfs *sp; sx_assert(&acct_sx, SX_XLOCKED); /* * If accounting was disabled before our kthread was scheduled, * then acct_vp might be NULL. If so, just ask our kthread to * exit and return. */ if (acct_vp == NULL) { acct_state |= ACCT_EXITREQ; return; } /* * If our vnode is no longer valid, tear it down and signal the * accounting thread to die. */ if (acct_vp->v_type == VBAD) { (void) acct_disable(NULL, 1); acct_state |= ACCT_EXITREQ; return; } /* * Stopping here is better than continuing, maybe it will be VBAD * next time around. */ sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); if (VFS_STATFS(acct_vp->v_mount, sp) < 0) { free(sp, M_STATFS); return; } if (acct_suspended) { if (sp->f_bavail > (int64_t)(acctresume * sp->f_blocks / 100)) { acct_suspended = 0; log(LOG_NOTICE, "Accounting resumed\n"); } } else { if (sp->f_bavail <= (int64_t)(acctsuspend * sp->f_blocks / 100)) { acct_suspended = 1; log(LOG_NOTICE, "Accounting suspended\n"); } } free(sp, M_STATFS); } /* * The main loop for the dedicated kernel thread that periodically calls * acctwatch(). */ static void acct_thread(void *dummy) { u_char pri; /* This is a low-priority kernel thread. */ pri = PRI_MAX_KERN; thread_lock(curthread); sched_prio(curthread, pri); thread_unlock(curthread); /* If another accounting kthread is already running, just die. */ sx_xlock(&acct_sx); if (acct_state & ACCT_RUNNING) { sx_xunlock(&acct_sx); kproc_exit(0); } acct_state |= ACCT_RUNNING; /* Loop until we are asked to exit. */ while (!(acct_state & ACCT_EXITREQ)) { /* Perform our periodic checks. */ acctwatch(); /* * We check this flag again before sleeping since the * acctwatch() might have shut down accounting and asked us * to exit. */ if (!(acct_state & ACCT_EXITREQ)) { sx_sleep(&acct_state, &acct_sx, 0, "-", acctchkfreq * hz); } } /* * Acknowledge the exit request and shutdown. We clear both the * exit request and running flags. */ acct_state = 0; sx_xunlock(&acct_sx); kproc_exit(0); } Index: projects/netbsd-tests-upstream-01-2017/sys/kern/kern_shutdown.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/kern/kern_shutdown.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/kern/kern_shutdown.c (revision 312218) @@ -1,1258 +1,1258 @@ /*- * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ekcd.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer"); #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; #else static int trace_on_panic = 0; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, ""); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); #ifdef EKCD FEATURE(ekcd, "Encrypted kernel crash dumps support"); MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data"); struct kerneldumpcrypto { uint8_t kdc_encryption; uint8_t kdc_iv[KERNELDUMP_IV_MAX_SIZE]; keyInstance kdc_ki; cipherInstance kdc_ci; off_t kdc_nextoffset; uint32_t kdc_dumpkeysize; struct kerneldumpkey kdc_dumpkey[]; }; #endif /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; int dumping; /* system is dumping */ int rebooting; /* system is rebooting */ static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) { error = kern_reroot(); } else { mtx_lock(&Giant); kern_reboot(uap->opt); mtx_unlock(&Giant); } } return (error); } /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL) { /* Send a signal to init(8) and have it shutdown the world. */ PROC_LOCK(initproc); if (howto & RB_POWEROFF) kern_psignal(initproc, SIGUSR2); else if (howto & RB_HALT) kern_psignal(initproc, SIGUSR1); else kern_psignal(initproc, SIGINT); PROC_UNLOCK(initproc); } else { /* No init(8) running, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (dumper.dumper == NULL) return (ENXIO); savectx(&dumppcb); dumptid = curthread->td_tid; dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(&dumper); } #endif if (coredump) error = dumpsys(&dumper); dumping--; return (error); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void kern_reboot(int howto) { static int once = 0; #if defined(SMP) /* * Bind us to CPU 0 so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); } #endif /* We're in the process of rebooting. */ rebooting = 1; /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; bufshutdown(show_busybufs); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp, 0); return (ENOENT); } if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). * * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. */ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: howto &= ~RB_HALT; break; } } } /* * Check to see if the system paniced, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. */ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif /* cpu_boot(howto); */ /* doesn't do anything at the moment */ cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANT_SUPPORT) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (1) or just a warning (0)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RWTUN, &kassert_do_log, 0, "KASSERT triggers a panic (1) or just a warning (0)"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RWTUN, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } +#endif /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; -#endif bootopt = RB_AUTOBOOT; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif #ifdef KDB if (newpanic && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; kern_reboot(bootopt); } /* * Support for poweroff delay. * * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if (!(howto & RB_POWEROFF) || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (panicstr) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop... ", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (panicstr) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop... ", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)]; SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD, dumpdevname, 0, "Device for kernel dumps"); #ifdef EKCD static struct kerneldumpcrypto * kerneldumpcrypto_create(size_t blocksize, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { struct kerneldumpcrypto *kdc; struct kerneldumpkey *kdk; uint32_t dumpkeysize; dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize); kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO); arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0); kdc->kdc_encryption = encryption; switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0) goto failed; break; default: goto failed; } kdc->kdc_dumpkeysize = dumpkeysize; kdk = kdc->kdc_dumpkey; kdk->kdk_encryption = kdc->kdc_encryption; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); kdk->kdk_encryptedkeysize = htod32(encryptedkeysize); memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize); return (kdc); failed: explicit_bzero(kdc, sizeof(*kdc) + dumpkeysize); free(kdc, M_EKCD); return (NULL); } #endif /* EKCD */ int kerneldumpcrypto_init(struct kerneldumpcrypto *kdc) { #ifndef EKCD return (0); #else uint8_t hash[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; struct kerneldumpkey *kdk; int error; error = 0; if (kdc == NULL) return (0); /* * When a user enters ddb it can write a crash dump multiple times. * Each time it should be encrypted using a different IV. */ SHA256_Init(&ctx); SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv)); SHA256_Final(hash, &ctx); bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv)); switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, kdc->kdc_iv) <= 0) { error = EINVAL; goto out; } break; default: error = EINVAL; goto out; } kdc->kdc_nextoffset = 0; kdk = kdc->kdc_dumpkey; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); out: explicit_bzero(hash, sizeof(hash)); return (error); #endif } uint32_t kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc) { #ifdef EKCD if (kdc == NULL) return (0); return (kdc->kdc_dumpkeysize); #else return (0); #endif } /* Registration of dumpers */ int set_dumper(struct dumperinfo *di, const char *devname, struct thread *td, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { size_t wantcopy; int error; error = priv_check(td, PRIV_SETDUMPER); if (error != 0) return (error); if (di == NULL) { error = 0; goto cleanup; } if (dumper.dumper != NULL) return (EBUSY); dumper = *di; dumper.blockbuf = NULL; dumper.kdc = NULL; if (encryption != KERNELDUMP_ENC_NONE) { #ifdef EKCD dumper.kdc = kerneldumpcrypto_create(di->blocksize, encryption, key, encryptedkeysize, encryptedkey); if (dumper.kdc == NULL) { error = EINVAL; goto cleanup; } #else error = EOPNOTSUPP; goto cleanup; #endif } wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname)); if (wantcopy >= sizeof(dumpdevname)) { printf("set_dumper: device name truncated from '%s' -> '%s'\n", devname, dumpdevname); } dumper.blockbuf = malloc(di->blocksize, M_DUMPER, M_WAITOK | M_ZERO); return (0); cleanup: #ifdef EKCD if (dumper.kdc != NULL) { explicit_bzero(dumper.kdc, sizeof(*dumper.kdc) + dumper.kdc->kdc_dumpkeysize); free(dumper.kdc, M_EKCD); } #endif if (dumper.blockbuf != NULL) { explicit_bzero(dumper.blockbuf, dumper.blocksize); free(dumper.blockbuf, M_DUMPER); } explicit_bzero(&dumper, sizeof(dumper)); dumpdevname[0] = '\0'; return (error); } static int dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length) { if (length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } return (0); } #ifdef EKCD static int dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size) { switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf, 8 * size, buf) <= 0) { return (EIO); } if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, buf + size - 16 /* IV size for AES-256-CBC */) <= 0) { return (EIO); } break; default: return (EINVAL); } return (0); } /* Encrypt data and call dumper. */ static int dump_encrypted_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { static uint8_t buf[KERNELDUMP_BUFFER_SIZE]; struct kerneldumpcrypto *kdc; int error; size_t nbytes; off_t nextoffset; kdc = di->kdc; error = dump_check_bounds(di, offset, length); if (error != 0) return (error); /* Signal completion. */ if (virtual == NULL && physical == 0 && offset == 0 && length == 0) { return (di->dumper(di->priv, virtual, physical, offset, length)); } /* Data have to be aligned to block size. */ if ((length % di->blocksize) != 0) return (EINVAL); /* * Data have to be written continuously becase we're encrypting using * CBC mode which has this assumption. */ if (kdc->kdc_nextoffset != 0 && kdc->kdc_nextoffset != offset) return (EINVAL); nextoffset = offset + (off_t)length; while (length > 0) { nbytes = MIN(length, sizeof(buf)); bcopy(virtual, buf, nbytes); if (dump_encrypt(kdc, buf, nbytes) != 0) return (EIO); error = di->dumper(di->priv, buf, physical, offset, nbytes); if (error != 0) return (error); offset += nbytes; virtual = (void *)((uint8_t *)virtual + nbytes); length -= nbytes; } kdc->kdc_nextoffset = nextoffset; return (0); } #endif /* EKCD */ /* Call dumper with bounds checking. */ static int dump_raw_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { int error; error = dump_check_bounds(di, offset, length); if (error != 0) return (error); return (di->dumper(di->priv, virtual, physical, offset, length)); } int dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { #ifdef EKCD if (di->kdc != NULL) { return (dump_encrypted_write(di, virtual, physical, offset, length)); } #endif return (dump_raw_write(di, virtual, physical, offset, length)); } static int dump_pad(struct dumperinfo *di, void *virtual, size_t length, void **buf, size_t *size) { if (length > di->blocksize) return (ENOMEM); *size = di->blocksize; if (length == di->blocksize) { *buf = virtual; } else { *buf = di->blockbuf; memcpy(*buf, virtual, length); memset((uint8_t *)*buf + length, 0, di->blocksize - length); } return (0); } static int dump_raw_write_pad(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length, size_t *size) { void *buf; int error; error = dump_pad(di, virtual, length, &buf, size); if (error != 0) return (error); return (dump_raw_write(di, buf, physical, offset, *size)); } int dump_write_pad(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length, size_t *size) { void *buf; int error; error = dump_pad(di, virtual, length, &buf, size); if (error != 0) return (error); return (dump_write(di, buf, physical, offset, *size)); } int dump_write_header(struct dumperinfo *di, struct kerneldumpheader *kdh, vm_offset_t physical, off_t offset) { size_t size; int ret; ret = dump_raw_write_pad(di, kdh, physical, offset, sizeof(*kdh), &size); if (ret == 0 && size != di->blocksize) ret = EINVAL; return (ret); } int dump_write_key(struct dumperinfo *di, vm_offset_t physical, off_t offset) { #ifndef EKCD return (0); #else /* EKCD */ struct kerneldumpcrypto *kdc; kdc = di->kdc; if (kdc == NULL) return (0); return (dump_raw_write(di, kdc->kdc_dumpkey, physical, offset, kdc->kdc_dumpkeysize)); #endif /* !EKCD */ } void mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver, uint64_t dumplen, uint32_t dumpkeysize, uint32_t blksz) { bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->dumpkeysize = htod32(dumpkeysize); kdh->blocksize = htod32(blksz); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); strlcpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); kdh->parity = kerneldump_parity(kdh); } #ifdef DDB DB_SHOW_COMMAND(panic, db_show_panic) { if (panicstr == NULL) db_printf("panicstr not set\n"); else db_printf("panic: %s\n", panicstr); } #endif Index: projects/netbsd-tests-upstream-01-2017/sys/kern/subr_gtaskqueue.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/kern/subr_gtaskqueue.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/kern/subr_gtaskqueue.c (revision 312218) @@ -1,937 +1,937 @@ /*- * Copyright (c) 2000 Doug Rabson * Copyright (c) 2014 Jeff Roberson * Copyright (c) 2016 Matthew Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_GTASKQUEUE, "taskqueue", "Task Queues"); static void gtaskqueue_thread_enqueue(void *); static void gtaskqueue_thread_loop(void *arg); struct gtaskqueue_busy { struct gtask *tb_running; TAILQ_ENTRY(gtaskqueue_busy) tb_link; }; static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1; struct gtaskqueue { STAILQ_HEAD(, gtask) tq_queue; gtaskqueue_enqueue_fn tq_enqueue; void *tq_context; char *tq_name; TAILQ_HEAD(, gtaskqueue_busy) tq_active; struct mtx tq_mutex; struct thread **tq_threads; int tq_tcount; int tq_spin; int tq_flags; int tq_callouts; taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS]; void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS]; }; #define TQ_FLAGS_ACTIVE (1 << 0) #define TQ_FLAGS_BLOCKED (1 << 1) #define TQ_FLAGS_UNLOCKED_ENQUEUE (1 << 2) #define DT_CALLOUT_ARMED (1 << 0) #define TQ_LOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_lock_spin(&(tq)->tq_mutex); \ else \ mtx_lock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_LOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_OWNED) #define TQ_UNLOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_unlock_spin(&(tq)->tq_mutex); \ else \ mtx_unlock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_UNLOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED) #ifdef INVARIANTS static void gtask_dump(struct gtask *gtask) { printf("gtask: %p ta_flags=%x ta_priority=%d ta_func=%p ta_context=%p\n", gtask, gtask->ta_flags, gtask->ta_priority, gtask->ta_func, gtask->ta_context); } #endif static __inline int TQ_SLEEP(struct gtaskqueue *tq, void *p, struct mtx *m, int pri, const char *wm, int t) { if (tq->tq_spin) return (msleep_spin(p, m, wm, t)); return (msleep(p, m, pri, wm, t)); } static struct gtaskqueue * _gtaskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context, int mtxflags, const char *mtxname __unused) { struct gtaskqueue *queue; char *tq_name; tq_name = malloc(TASKQUEUE_NAMELEN, M_GTASKQUEUE, mflags | M_ZERO); if (!tq_name) return (NULL); snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? name : "taskqueue"); queue = malloc(sizeof(struct gtaskqueue), M_GTASKQUEUE, mflags | M_ZERO); if (!queue) return (NULL); STAILQ_INIT(&queue->tq_queue); TAILQ_INIT(&queue->tq_active); queue->tq_enqueue = enqueue; queue->tq_context = context; queue->tq_name = tq_name; queue->tq_spin = (mtxflags & MTX_SPIN) != 0; queue->tq_flags |= TQ_FLAGS_ACTIVE; if (enqueue == gtaskqueue_thread_enqueue) queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE; mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags); return (queue); } /* * Signal a taskqueue thread to terminate. */ static void gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq) { while (tq->tq_tcount > 0 || tq->tq_callouts > 0) { wakeup(tq); TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0); } } static void gtaskqueue_free(struct gtaskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_ACTIVE; gtaskqueue_terminate(queue->tq_threads, queue); KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?")); KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks")); mtx_destroy(&queue->tq_mutex); free(queue->tq_threads, M_GTASKQUEUE); free(queue->tq_name, M_GTASKQUEUE); free(queue, M_GTASKQUEUE); } int grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *gtask) { #ifdef INVARIANTS if (queue == NULL) { gtask_dump(gtask); panic("queue == NULL"); } #endif TQ_LOCK(queue); if (gtask->ta_flags & TASK_ENQUEUED) { TQ_UNLOCK(queue); return (0); } STAILQ_INSERT_TAIL(&queue->tq_queue, gtask, ta_link); gtask->ta_flags |= TASK_ENQUEUED; TQ_UNLOCK(queue); if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0) queue->tq_enqueue(queue->tq_context); return (0); } static void gtaskqueue_task_nop_fn(void *context) { } /* * Block until all currently queued tasks in this taskqueue * have begun execution. Tasks queued during execution of * this function are ignored. */ static void gtaskqueue_drain_tq_queue(struct gtaskqueue *queue) { struct gtask t_barrier; if (STAILQ_EMPTY(&queue->tq_queue)) return; /* * Enqueue our barrier after all current tasks, but with * the highest priority so that newly queued tasks cannot * pass it. Because of the high priority, we can not use * taskqueue_enqueue_locked directly (which drops the lock * anyway) so just insert it at tail while we have the * queue lock. */ GTASK_INIT(&t_barrier, 0, USHRT_MAX, gtaskqueue_task_nop_fn, &t_barrier); STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link); t_barrier.ta_flags |= TASK_ENQUEUED; /* * Once the barrier has executed, all previously queued tasks * have completed or are currently executing. */ while (t_barrier.ta_flags & TASK_ENQUEUED) TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0); } /* * Block until all currently executing tasks for this taskqueue * complete. Tasks that begin execution during the execution * of this function are ignored. */ static void gtaskqueue_drain_tq_active(struct gtaskqueue *queue) { struct gtaskqueue_busy tb_marker, *tb_first; if (TAILQ_EMPTY(&queue->tq_active)) return; /* Block taskq_terminate().*/ queue->tq_callouts++; /* * Wait for all currently executing taskqueue threads * to go idle. */ tb_marker.tb_running = TB_DRAIN_WAITER; TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link); while (TAILQ_FIRST(&queue->tq_active) != &tb_marker) TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0); TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link); /* * Wakeup any other drain waiter that happened to queue up * without any intervening active thread. */ tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); /* Release taskqueue_terminate(). */ queue->tq_callouts--; if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0) wakeup_one(queue->tq_threads); } void gtaskqueue_block(struct gtaskqueue *queue) { TQ_LOCK(queue); queue->tq_flags |= TQ_FLAGS_BLOCKED; TQ_UNLOCK(queue); } void gtaskqueue_unblock(struct gtaskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_BLOCKED; if (!STAILQ_EMPTY(&queue->tq_queue)) queue->tq_enqueue(queue->tq_context); TQ_UNLOCK(queue); } static void gtaskqueue_run_locked(struct gtaskqueue *queue) { struct gtaskqueue_busy tb; struct gtaskqueue_busy *tb_first; struct gtask *gtask; KASSERT(queue != NULL, ("tq is NULL")); TQ_ASSERT_LOCKED(queue); tb.tb_running = NULL; while (STAILQ_FIRST(&queue->tq_queue)) { TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link); /* * Carefully remove the first task from the queue and * clear its TASK_ENQUEUED flag */ gtask = STAILQ_FIRST(&queue->tq_queue); KASSERT(gtask != NULL, ("task is NULL")); STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); gtask->ta_flags &= ~TASK_ENQUEUED; tb.tb_running = gtask; TQ_UNLOCK(queue); KASSERT(gtask->ta_func != NULL, ("task->ta_func is NULL")); gtask->ta_func(gtask->ta_context); TQ_LOCK(queue); tb.tb_running = NULL; wakeup(gtask); TAILQ_REMOVE(&queue->tq_active, &tb, tb_link); tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); } } static int task_is_running(struct gtaskqueue *queue, struct gtask *gtask) { struct gtaskqueue_busy *tb; TQ_ASSERT_LOCKED(queue); TAILQ_FOREACH(tb, &queue->tq_active, tb_link) { if (tb->tb_running == gtask) return (1); } return (0); } static int gtaskqueue_cancel_locked(struct gtaskqueue *queue, struct gtask *gtask) { if (gtask->ta_flags & TASK_ENQUEUED) STAILQ_REMOVE(&queue->tq_queue, gtask, gtask, ta_link); gtask->ta_flags &= ~TASK_ENQUEUED; return (task_is_running(queue, gtask) ? EBUSY : 0); } int gtaskqueue_cancel(struct gtaskqueue *queue, struct gtask *gtask) { int error; TQ_LOCK(queue); error = gtaskqueue_cancel_locked(queue, gtask); TQ_UNLOCK(queue); return (error); } void gtaskqueue_drain(struct gtaskqueue *queue, struct gtask *gtask) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); while ((gtask->ta_flags & TASK_ENQUEUED) || task_is_running(queue, gtask)) TQ_SLEEP(queue, gtask, &queue->tq_mutex, PWAIT, "-", 0); TQ_UNLOCK(queue); } void gtaskqueue_drain_all(struct gtaskqueue *queue) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); gtaskqueue_drain_tq_queue(queue); gtaskqueue_drain_tq_active(queue); TQ_UNLOCK(queue); } static int _gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, va_list ap) { char ktname[MAXCOMLEN + 1]; struct thread *td; struct gtaskqueue *tq; int i, error; if (count <= 0) return (EINVAL); vsnprintf(ktname, sizeof(ktname), name, ap); tq = *tqp; tq->tq_threads = malloc(sizeof(struct thread *) * count, M_GTASKQUEUE, M_NOWAIT | M_ZERO); if (tq->tq_threads == NULL) { printf("%s: no memory for %s threads\n", __func__, ktname); return (ENOMEM); } for (i = 0; i < count; i++) { if (count == 1) error = kthread_add(gtaskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname); else error = kthread_add(gtaskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s_%d", ktname, i); if (error) { /* should be ok to continue, taskqueue_free will dtrt */ printf("%s: kthread_add(%s): error %d", __func__, ktname, error); tq->tq_threads[i] = NULL; /* paranoid */ } else tq->tq_tcount++; } for (i = 0; i < count; i++) { if (tq->tq_threads[i] == NULL) continue; td = tq->tq_threads[i]; if (mask) { error = cpuset_setthread(td->td_tid, mask); /* * Failing to pin is rarely an actual fatal error; * it'll just affect performance. */ if (error) printf("%s: curthread=%llu: can't pin; " "error=%d\n", __func__, (unsigned long long) td->td_tid, error); } thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); thread_unlock(td); } return (0); } static int gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _gtaskqueue_start_threads(tqp, count, pri, NULL, name, ap); va_end(ap); return (error); } static inline void gtaskqueue_run_callback(struct gtaskqueue *tq, enum taskqueue_callback_type cb_type) { taskqueue_callback_fn tq_callback; TQ_ASSERT_UNLOCKED(tq); tq_callback = tq->tq_callbacks[cb_type]; if (tq_callback != NULL) tq_callback(tq->tq_cb_contexts[cb_type]); } static void gtaskqueue_thread_loop(void *arg) { struct gtaskqueue **tqp, *tq; tqp = arg; tq = *tqp; gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT); TQ_LOCK(tq); while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { /* XXX ? */ gtaskqueue_run_locked(tq); /* * Because taskqueue_run() can drop tq_mutex, we need to * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the * meantime, which means we missed a wakeup. */ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0) break; TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0); } gtaskqueue_run_locked(tq); /* * This thread is on its way out, so just drop the lock temporarily * in order to call the shutdown callback. This allows the callback * to look at the taskqueue, even just before it dies. */ TQ_UNLOCK(tq); gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); TQ_LOCK(tq); /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); kthread_exit(); } static void gtaskqueue_thread_enqueue(void *context) { struct gtaskqueue **tqp, *tq; tqp = context; tq = *tqp; wakeup_one(tq); } static struct gtaskqueue * gtaskqueue_create_fast(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _gtaskqueue_create(name, mflags, enqueue, context, MTX_SPIN, "fast_taskqueue"); } struct taskqgroup_cpu { LIST_HEAD(, grouptask) tgc_tasks; struct gtaskqueue *tgc_taskq; int tgc_cnt; int tgc_cpu; }; struct taskqgroup { struct taskqgroup_cpu tqg_queue[MAXCPU]; struct mtx tqg_lock; char * tqg_name; int tqg_adjusting; int tqg_stride; int tqg_cnt; }; struct taskq_bind_task { struct gtask bt_task; int bt_cpuid; }; static void taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu) { struct taskqgroup_cpu *qcpu; qcpu = &qgroup->tqg_queue[idx]; LIST_INIT(&qcpu->tgc_tasks); qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK, taskqueue_thread_enqueue, &qcpu->tgc_taskq); gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT, "%s_%d", qgroup->tqg_name, idx); qcpu->tgc_cpu = cpu; } static void taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx) { gtaskqueue_free(qgroup->tqg_queue[idx].tgc_taskq); } /* * Find the taskq with least # of tasks that doesn't currently have any * other queues from the uniq identifier. */ static int taskqgroup_find(struct taskqgroup *qgroup, void *uniq) { struct grouptask *n; int i, idx, mincnt; int strict; mtx_assert(&qgroup->tqg_lock, MA_OWNED); if (qgroup->tqg_cnt == 0) return (0); idx = -1; mincnt = INT_MAX; /* * Two passes; First scan for a queue with the least tasks that * does not already service this uniq id. If that fails simply find * the queue with the least total tasks; */ for (strict = 1; mincnt == INT_MAX; strict = 0) { for (i = 0; i < qgroup->tqg_cnt; i++) { if (qgroup->tqg_queue[i].tgc_cnt > mincnt) continue; if (strict) { LIST_FOREACH(n, &qgroup->tqg_queue[i].tgc_tasks, gt_list) if (n->gt_uniq == uniq) break; if (n != NULL) continue; } mincnt = qgroup->tqg_queue[i].tgc_cnt; idx = i; } } if (idx == -1) panic("taskqgroup_find: Failed to pick a qid."); return (idx); } void taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask, void *uniq, int irq, char *name) { cpuset_t mask; int qid; gtask->gt_uniq = uniq; gtask->gt_name = name; gtask->gt_irq = irq; gtask->gt_cpu = -1; mtx_lock(&qgroup->tqg_lock); qid = taskqgroup_find(qgroup, uniq); qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; - if (irq != -1 && smp_started) { + if (irq != -1 && (smp_started || mp_ncpus == 1)) { gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu; CPU_ZERO(&mask); CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask); mtx_unlock(&qgroup->tqg_lock); intr_setaffinity(irq, &mask); } else mtx_unlock(&qgroup->tqg_lock); } static void taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { cpuset_t mask; int qid, cpu; mtx_lock(&qgroup->tqg_lock); qid = taskqgroup_find(qgroup, gtask->gt_uniq); cpu = qgroup->tqg_queue[qid].tgc_cpu; if (gtask->gt_irq != -1) { mtx_unlock(&qgroup->tqg_lock); CPU_ZERO(&mask); CPU_SET(cpu, &mask); intr_setaffinity(gtask->gt_irq, &mask); mtx_lock(&qgroup->tqg_lock); } qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); } int taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, void *uniq, int cpu, int irq, char *name) { cpuset_t mask; int i, qid; qid = -1; gtask->gt_uniq = uniq; gtask->gt_name = name; gtask->gt_irq = irq; gtask->gt_cpu = cpu; mtx_lock(&qgroup->tqg_lock); - if (smp_started) { + if (smp_started || mp_ncpus == 1) { for (i = 0; i < qgroup->tqg_cnt; i++) if (qgroup->tqg_queue[i].tgc_cpu == cpu) { qid = i; break; } if (qid == -1) { mtx_unlock(&qgroup->tqg_lock); return (EINVAL); } } else qid = 0; qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; cpu = qgroup->tqg_queue[qid].tgc_cpu; mtx_unlock(&qgroup->tqg_lock); CPU_ZERO(&mask); CPU_SET(cpu, &mask); - if (irq != -1 && smp_started) + if (irq != -1 && (smp_started || mp_ncpus == 1)) intr_setaffinity(irq, &mask); return (0); } static int taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { cpuset_t mask; int i, qid, irq, cpu; qid = -1; irq = gtask->gt_irq; cpu = gtask->gt_cpu; - MPASS(smp_started); + MPASS(smp_started || mp_ncpus == 1); mtx_lock(&qgroup->tqg_lock); for (i = 0; i < qgroup->tqg_cnt; i++) if (qgroup->tqg_queue[i].tgc_cpu == cpu) { qid = i; break; } if (qid == -1) { mtx_unlock(&qgroup->tqg_lock); return (EINVAL); } qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); CPU_ZERO(&mask); CPU_SET(cpu, &mask); if (irq != -1) intr_setaffinity(irq, &mask); return (0); } void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask) { int i; mtx_lock(&qgroup->tqg_lock); for (i = 0; i < qgroup->tqg_cnt; i++) if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue) break; if (i == qgroup->tqg_cnt) panic("taskqgroup_detach: task not in group\n"); qgroup->tqg_queue[i].tgc_cnt--; LIST_REMOVE(gtask, gt_list); mtx_unlock(&qgroup->tqg_lock); gtask->gt_taskqueue = NULL; } static void taskqgroup_binder(void *ctx) { struct taskq_bind_task *gtask = (struct taskq_bind_task *)ctx; cpuset_t mask; int error; CPU_ZERO(&mask); CPU_SET(gtask->bt_cpuid, &mask); error = cpuset_setthread(curthread->td_tid, &mask); thread_lock(curthread); sched_bind(curthread, gtask->bt_cpuid); thread_unlock(curthread); if (error) printf("taskqgroup_binder: setaffinity failed: %d\n", error); free(gtask, M_DEVBUF); } static void taskqgroup_bind(struct taskqgroup *qgroup) { struct taskq_bind_task *gtask; int i; /* * Bind taskqueue threads to specific CPUs, if they have been assigned * one. */ if (qgroup->tqg_cnt == 1) return; for (i = 0; i < qgroup->tqg_cnt; i++) { gtask = malloc(sizeof (*gtask), M_DEVBUF, M_WAITOK); GTASK_INIT(>ask->bt_task, 0, 0, taskqgroup_binder, gtask); gtask->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu; grouptaskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq, >ask->bt_task); } } static int _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) { LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL); struct grouptask *gtask; int i, k, old_cnt, old_cpu, cpu; mtx_assert(&qgroup->tqg_lock, MA_OWNED); - if (cnt < 1 || cnt * stride > mp_ncpus || !smp_started) { + if (cnt < 1 || cnt * stride > mp_ncpus || (!smp_started && (mp_ncpus != 1))) { printf("taskqgroup_adjust failed cnt: %d stride: %d mp_ncpus: %d smp_started: %d\n", cnt, stride, mp_ncpus, smp_started); return (EINVAL); } if (qgroup->tqg_adjusting) { printf("taskqgroup_adjust failed: adjusting\n"); return (EBUSY); } qgroup->tqg_adjusting = 1; old_cnt = qgroup->tqg_cnt; old_cpu = 0; if (old_cnt < cnt) old_cpu = qgroup->tqg_queue[old_cnt].tgc_cpu; mtx_unlock(&qgroup->tqg_lock); /* * Set up queue for tasks added before boot. */ if (old_cnt == 0) { LIST_SWAP(>ask_head, &qgroup->tqg_queue[0].tgc_tasks, grouptask, gt_list); qgroup->tqg_queue[0].tgc_cnt = 0; } /* * If new taskq threads have been added. */ cpu = old_cpu; for (i = old_cnt; i < cnt; i++) { taskqgroup_cpu_create(qgroup, i, cpu); for (k = 0; k < stride; k++) cpu = CPU_NEXT(cpu); } mtx_lock(&qgroup->tqg_lock); qgroup->tqg_cnt = cnt; qgroup->tqg_stride = stride; /* * Adjust drivers to use new taskqs. */ for (i = 0; i < old_cnt; i++) { while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) { LIST_REMOVE(gtask, gt_list); qgroup->tqg_queue[i].tgc_cnt--; LIST_INSERT_HEAD(>ask_head, gtask, gt_list); } } mtx_unlock(&qgroup->tqg_lock); while ((gtask = LIST_FIRST(>ask_head))) { LIST_REMOVE(gtask, gt_list); if (gtask->gt_cpu == -1) taskqgroup_attach_deferred(qgroup, gtask); else if (taskqgroup_attach_cpu_deferred(qgroup, gtask)) taskqgroup_attach_deferred(qgroup, gtask); } #ifdef INVARIANTS mtx_lock(&qgroup->tqg_lock); for (i = 0; i < qgroup->tqg_cnt; i++) { MPASS(qgroup->tqg_queue[i].tgc_taskq != NULL); LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list) MPASS(gtask->gt_taskqueue != NULL); } mtx_unlock(&qgroup->tqg_lock); #endif /* * If taskq thread count has been reduced. */ for (i = cnt; i < old_cnt; i++) taskqgroup_cpu_remove(qgroup, i); taskqgroup_bind(qgroup); mtx_lock(&qgroup->tqg_lock); qgroup->tqg_adjusting = 0; return (0); } int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) { int error; mtx_lock(&qgroup->tqg_lock); error = _taskqgroup_adjust(qgroup, cnt, stride); mtx_unlock(&qgroup->tqg_lock); return (error); } struct taskqgroup * taskqgroup_create(char *name) { struct taskqgroup *qgroup; qgroup = malloc(sizeof(*qgroup), M_GTASKQUEUE, M_WAITOK | M_ZERO); mtx_init(&qgroup->tqg_lock, "taskqgroup", NULL, MTX_DEF); qgroup->tqg_name = name; LIST_INIT(&qgroup->tqg_queue[0].tgc_tasks); return (qgroup); } void taskqgroup_destroy(struct taskqgroup *qgroup) { } Index: projects/netbsd-tests-upstream-01-2017/sys/kern/uipc_mbuf.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/kern/uipc_mbuf.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/kern/uipc_mbuf.c (revision 312218) @@ -1,1869 +1,1869 @@ /*- * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_mbuf_stress_test.h" #include "opt_mbuf_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t", "void*", "void*"); SDT_PROBE_DEFINE(sdt, , , m__cljset); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, "struct mbuf *", "mbufinfo_t *"); #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; #ifdef MBUF_STRESS_TEST int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; int m_defragrandomfailures; #endif /* * sysctl(8) exported objects */ SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, &max_linkhdr, 0, "Size of largest link layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, &max_protohdr, 0, "Size of largest protocol layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, &max_hdr, 0, "Size of largest link plus protocol header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD, &max_datalen, 0, "Minimum space left in mbuf after max_hdr"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif /* * Ensure the correct size of various mbuf parameters. It could be off due * to compiler-induced padding and alignment artifacts. */ CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); /* * mbuf data storage should be 64-bit aligned regardless of architectural * pointer size; check this is the case with and without a packet header. */ CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); /* * While the specific values here don't matter too much (i.e., +/- a few * words), we do want to ensure that changes to these values are carefully * reasoned about and properly documented. This is especially the case as * network-protocol and device-driver modules encode these layouts, and must * be recompiled if the structures change. Check these values at compile time * against the ones documented in comments in mbuf.h. * * NB: Possibly they should be documented there via #define's and not just * comments. */ #if defined(__LP64__) CTASSERT(offsetof(struct mbuf, m_dat) == 32); CTASSERT(sizeof(struct pkthdr) == 56); CTASSERT(sizeof(struct m_ext) == 48); #else CTASSERT(offsetof(struct mbuf, m_dat) == 24); CTASSERT(sizeof(struct pkthdr) == 48); CTASSERT(sizeof(struct m_ext) == 28); #endif /* * Assert that the queue(3) macros produce code of the same size as an old * plain pointer does. */ #ifdef INVARIANTS -static struct mbuf m_assertbuf; +static struct mbuf __used m_assertbuf; CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); #endif /* * Attach the cluster from *m to *n, set up m_ext in *n * and bump the refcount of the cluster. */ void mb_dupcl(struct mbuf *n, struct mbuf *m) { volatile u_int *refcnt; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n)); n->m_ext = m->m_ext; n->m_flags |= M_EXT; n->m_flags |= m->m_flags & M_RDONLY; /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; } if (*refcnt == 1) *refcnt += 1; else atomic_add_int(refcnt, 1); } void m_demote_pkthdr(struct mbuf *m) { M_ASSERTPKTHDR(m); m_tag_delete_chain(m, NULL); m->m_flags &= ~M_PKTHDR; bzero(&m->m_pkthdr, sizeof(struct pkthdr)); } /* * Clean up mbuf (chain) from any tags and packet headers. * If "all" is set then the first mbuf in the chain will be * cleaned too. */ void m_demote(struct mbuf *m0, int all, int flags) { struct mbuf *m; for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p", __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); } } /* * Sanity checks on mbuf (chain) for use in KASSERT() and general * debugging. * Returns 0 or panics when bad and 1 on all tests passed. * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they * blow up later. */ int m_sanity(struct mbuf *m0, int sanitize) { struct mbuf *m; caddr_t a, b; int pktlen = 0; #ifdef INVARIANTS #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m) #else #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m) #endif for (m = m0; m != NULL; m = m->m_next) { /* * Basic pointer checks. If any of these fails then some * unrelated kernel memory before or after us is trashed. * No way to recover from that. */ a = M_START(m); b = a + M_SIZE(m); if ((caddr_t)m->m_data < a) M_SANITY_ACTION("m_data outside mbuf data range left"); if ((caddr_t)m->m_data > b) M_SANITY_ACTION("m_data outside mbuf data range right"); if ((caddr_t)m->m_data + m->m_len > b) M_SANITY_ACTION("m_data + m_len exeeds mbuf space"); /* m->m_nextpkt may only be set on first mbuf in chain. */ if (m != m0 && m->m_nextpkt != NULL) { if (sanitize) { m_freem(m->m_nextpkt); m->m_nextpkt = (struct mbuf *)0xDEADC0DE; } else M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf"); } /* packet length (not mbuf length!) calculation */ if (m0->m_flags & M_PKTHDR) pktlen += m->m_len; /* m_tags may only be attached to first mbuf in chain. */ if (m != m0 && m->m_flags & M_PKTHDR && !SLIST_EMPTY(&m->m_pkthdr.tags)) { if (sanitize) { m_tag_delete_chain(m, NULL); /* put in 0xDEADC0DE perhaps? */ } else M_SANITY_ACTION("m_tags on in-chain mbuf"); } /* M_PKTHDR may only be set on first mbuf in chain */ if (m != m0 && m->m_flags & M_PKTHDR) { if (sanitize) { bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); m->m_flags &= ~M_PKTHDR; /* put in 0xDEADCODE and leave hdr flag in */ } else M_SANITY_ACTION("M_PKTHDR on in-chain mbuf"); } } m = m0; if (pktlen && pktlen != m->m_pkthdr.len) { if (sanitize) m->m_pkthdr.len = 0; else M_SANITY_ACTION("m_pkthdr.len != mbuf chain length"); } return 1; #undef M_SANITY_ACTION } /* * Non-inlined part of m_init(). */ int m_pkthdr_init(struct mbuf *m, int how) { #ifdef MAC int error; #endif m->m_data = m->m_pktdat; bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif return (0); } /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 /* see below for why these are not enabled */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_move_pkthdr: to has tags")); #endif #ifdef MAC /* * XXXMAC: It could be this should also occur for non-MAC? */ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) { #if 0 /* * The mbuf allocator only initializes the pkthdr * when the mbuf is allocated with m_gethdr(). Many users * (e.g. m_copy*, m_prepend) use m_get() and then * smash the pkthdr as needed causing these * assertions to trip. For now just disable them. */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); #endif MBUF_CHECKSLEEP(how); #ifdef MAC if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; if (m->m_flags & M_PKTHDR) mn = m_gethdr(how, m->m_type); else mn = m_get(how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) m_move_pkthdr(mn, m); mn->m_next = m; m = mn; if (len < M_SIZE(m)) M_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); MBUF_CHECKSLEEP(wait); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = NULL; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } if (copyhdr) n = m_gethdr(wait, m->m_type); else n = m_get(wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. */ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MBUF_CHECKSLEEP(how); n = m_get(how, m->m_type); top = n; if (n == NULL) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { o = m_get(how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(const struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; MBUF_CHECKSLEEP(how); /* Sanity check */ if (m == NULL) return (NULL); M_ASSERTPKTHDR(m); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ if (remain >= MINCLSIZE) { n = m_getcl(how, m->m_type, 0); nsize = MCLBYTES; } else { n = m_get(how, m->m_type); nsize = MLEN; } if (n == NULL) goto nospace; if (top == NULL) { /* First one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) { m_free(n); goto nospace; } if ((n->m_flags & M_EXT) == 0) nsize = MHLEN; n->m_flags &= ~M_RDONLY; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. */ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (!M_WRITABLE(m) || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } /* * Concatenate two pkthdr mbuf chains. */ void m_catpkt(struct mbuf *m, struct mbuf *n) { M_ASSERTPKTHDR(m); M_ASSERTPKTHDR(n); m->m_pkthdr.len += n->m_pkthdr.len; m_demote(n, 1, 0); m_cat(m, n); } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod will work * for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Like m_pullup(), except a new mbuf is always allocated, and we allow * the amount of empty space before the data in the new mbuf to be specified * (in the event that the caller expects to prepend later). */ struct mbuf * m_copyup(struct mbuf *n, int len, int dstoff) { struct mbuf *m; int count, space; if (len > (MHLEN - dstoff)) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); m->m_data += dstoff; space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. */ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; MBUF_CHECKSLEEP(wait); for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR && remain == 0) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_next = m->m_next; m->m_next = NULL; n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); } else if (m0->m_flags & M_PKTHDR) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ M_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else M_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { n = m_get(wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_data = m->m_data + len; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = NULL, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); while (totlen > 0) { if (top == NULL) { /* First one, must be PKTHDR */ if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); len = MCLBYTES; } else { m = m_gethdr(M_NOWAIT, MT_DATA); len = MHLEN; /* Place initial small packet/header at end of mbuf */ if (m && totlen + off + max_linkhdr <= MHLEN) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (m == NULL) return NULL; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; } else { if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, 0); len = MCLBYTES; } else { m = m_get(M_NOWAIT, MT_DATA); len = MLEN; } if (m == NULL) { m_freem(top); return NULL; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. */ void m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) goto out; bzero(mtod(n, caddr_t), MLEN); n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { if (m->m_next == NULL && (len > m->m_len - off)) { m->m_len += min(len - (m->m_len - off), M_TRAILINGSPACE(m)); } mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * Return 1 if able to complete the job; otherwise 0. */ int m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space, remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len, remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } /* * Apply function f to the data in an mbuf chain starting "off" bytes from * the beginning, continuing for "len" bytes. */ int m_apply(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { u_int count; int rval; KASSERT(off >= 0, ("m_apply, negative off %d", off)); KASSERT(len >= 0, ("m_apply, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); count = min(m->m_len - off, len); rval = (*f)(arg, mtod(m, caddr_t) + off, count); if (rval) return (rval); len -= count; off = 0; m = m->m_next; } return (0); } /* * Return a pointer to mbuf/offset of location in mbuf chain. */ struct mbuf * m_getptr(struct mbuf *m, int loc, int *off) { while (loc >= 0) { /* Normal end of search. */ if (m->m_len > loc) { *off = loc; return (m); } else { loc -= m->m_len; if (m->m_next == NULL) { if (loc == 0) { /* Point at the end of valid data. */ *off = m->m_len; return (m); } return (NULL); } m = m->m_next; } } return (NULL); } void m_print(const struct mbuf *m, int maxlen) { int len; int pdata; const struct mbuf *m2; if (m == NULL) { printf("mbuf: %p\n", m); return; } if (m->m_flags & M_PKTHDR) len = m->m_pkthdr.len; else len = -1; m2 = m; while (m2 != NULL && (len == -1 || len)) { pdata = m2->m_len; if (maxlen != -1 && pdata > maxlen) pdata = maxlen; printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); if (pdata) printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); if (len != -1) len -= m2->m_len; m2 = m2->m_next; } if (len > 0) printf("%d bytes unaccounted for.\n", len); return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } /* * Defragment a mbuf chain, returning the shortest possible * chain of mbufs and clusters. If allocation fails and * this cannot be completed, NULL will be returned, but * the passed in chain will be unchanged. Upon success, * the original chain will be freed, and the new chain * will be returned. * * If a non-packet header is passed in, the original * mbuf (chain?) will be returned unharmed. */ struct mbuf * m_defrag(struct mbuf *m0, int how) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0, length; MBUF_CHECKSLEEP(how); if (!(m0->m_flags & M_PKTHDR)) return (m0); m_fixhdr(m0); /* Needed sanity check */ #ifdef MBUF_STRESS_TEST if (m_defragrandomfailures) { int temp = arc4random() & 0xff; if (temp == 0xba) goto nospace; } #endif if (m0->m_pkthdr.len > MHLEN) m_final = m_getcl(how, MT_DATA, M_PKTHDR); else m_final = m_gethdr(how, MT_DATA); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; while (progress < m0->m_pkthdr.len) { length = m0->m_pkthdr.len - progress; if (length > MCLBYTES) length = MCLBYTES; if (m_new == NULL) { if (length > MLEN) m_new = m_getcl(how, MT_DATA, 0); else m_new = m_get(how, MT_DATA); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, length, mtod(m_new, caddr_t)); progress += length; m_new->m_len = length; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } #ifdef MBUF_STRESS_TEST if (m0->m_next == NULL) m_defraguseless++; #endif m_freem(m0); m0 = m_final; #ifdef MBUF_STRESS_TEST m_defragpackets++; m_defragbytes += m0->m_pkthdr.len; #endif return (m0); nospace: #ifdef MBUF_STRESS_TEST m_defragfailure++; #endif if (m_final) m_freem(m_final); return (NULL); } /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and * the original mbuf chain is left in its present (potentially * modified) state. We use two techniques: collapsing consecutive * mbufs and replacing consecutive mbufs by a cluster. * * NB: this should really be named m_defrag but that name is taken */ struct mbuf * m_collapse(struct mbuf *m0, int how, int maxfrags) { struct mbuf *m, *n, *n2, **prev; u_int curfrags; /* * Calculate the current number of frags. */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) curfrags++; /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the * pkthdr. This may be suboptimal if the first mbuf has much * less data than the following. */ m = m0; again: for (;;) { n = m->m_next; if (n == NULL) break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, n->m_len); m->m_len += n->m_len; m->m_next = n->m_next; m_free(n); if (--curfrags <= maxfrags) return m0; } else m = n; } KASSERT(maxfrags > 1, ("maxfrags %u, but normal collapse failed", maxfrags)); /* * Collapse consecutive mbufs to a cluster. */ prev = &m0->m_next; /* NB: not the first mbuf */ while ((n = *prev) != NULL) { if ((n2 = n->m_next) != NULL && n->m_len + n2->m_len < MCLBYTES) { m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; bcopy(mtod(n, void *), mtod(m, void *), n->m_len); bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, n2->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; m_free(n); m_free(n2); if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ return m0; /* * Still not there, try the normal collapse * again before we allocate another cluster. */ goto again; } prev = &n->m_next; } /* * No place where we can collapse to a cluster; punt. * This can occur if, for example, you request 2 frags * but the packet requires that both be clusters (we * never reallocate the first mbuf to avoid moving the * packet header). */ bad: return NULL; } #ifdef MBUF_STRESS_TEST /* * Fragment an mbuf chain. There's no reason you'd ever want to do * this in normal usage, but it's great for stress testing various * mbuf consumers. * * If fragmentation is not possible, the original chain will be * returned. * * Possible length values: * 0 no fragmentation will occur * > 0 each fragment will be of the specified length * -1 each fragment will be the same random value in length * -2 each fragment's length will be entirely random * (Random values range from 1 to 256) */ struct mbuf * m_fragment(struct mbuf *m0, int how, int length) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0; if (!(m0->m_flags & M_PKTHDR)) return (m0); if ((length == 0) || (length < -2)) return (m0); m_fixhdr(m0); /* Needed sanity check */ m_final = m_getcl(how, MT_DATA, M_PKTHDR); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; if (length == -1) length = 1 + (arc4random() & 255); while (progress < m0->m_pkthdr.len) { int fraglen; if (length > 0) fraglen = length; else fraglen = 1 + (arc4random() & 255); if (fraglen > m0->m_pkthdr.len - progress) fraglen = m0->m_pkthdr.len - progress; if (fraglen > MCLBYTES) fraglen = MCLBYTES; if (m_new == NULL) { m_new = m_getcl(how, MT_DATA, 0); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t)); progress += fraglen; m_new->m_len = fraglen; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } m_freem(m0); m0 = m_final; return (m0); nospace: if (m_final) m_freem(m_final); /* Return the original chain on failure */ return (m0); } #endif /* * Copy the contents of uio into a properly sized mbuf chain. */ struct mbuf * m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) { struct mbuf *m, *mb; int error, length; ssize_t total; int progress = 0; /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. */ if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* * The smallest unit returned by m_getm2() is a single mbuf * with pkthdr. We can't align past it. */ if (align >= MHLEN) return (NULL); /* * Give us the full allocation or nothing. * If len is zero return the smallest empty mbuf. */ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); if (m == NULL) return (NULL); m->m_data += align; /* Fill all mbufs with uio data and update header information. */ for (mb = m; mb != NULL; mb = mb->m_next) { length = min(M_TRAILINGSPACE(mb), total - progress); error = uiomove(mtod(mb, void *), length, uio); if (error) { m_freem(m); return (NULL); } mb->m_len = length; progress += length; if (flags & M_PKTHDR) m->m_pkthdr.len += length; } KASSERT(progress == total, ("%s: progress != total", __func__)); return (m); } /* * Copy an mbuf chain into a uio limited by len if set. */ int m_mbuftouio(struct uio *uio, struct mbuf *m, int len) { int error, length, total; int progress = 0; if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* Fill the uio with data from the mbufs. */ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); error = uiomove(mtod(m, void *), length, uio); if (error) return (error); progress += length; } return (0); } /* * Create a writable copy of the mbuf chain. While doing this * we compact the chain with a goal of producing a chain with * at most two mbufs. The second mbuf in this chain is likely * to be a cluster. The primary purpose of this work is to create * a writable packet for encryption, compression, etc. The * secondary goal is to linearize the data so the data can be * passed to crypto hardware in the most efficient manner possible. */ struct mbuf * m_unshare(struct mbuf *m0, int how) { struct mbuf *m, *mprev; struct mbuf *n, *mfirst, *mlast; int len, off; mprev = NULL; for (m = m0; m != NULL; m = mprev->m_next) { /* * Regular mbufs are ignored unless there's a cluster * in front of it that we can use to coalesce. We do * the latter mainly so later clusters can be coalesced * also w/o having to handle them specially (i.e. convert * mbuf+cluster -> cluster). This optimization is heavily * influenced by the assumption that we're running over * Ethernet where MCLBYTES is large enough that the max * packet size will permit lots of coalescing into a * single cluster. This in turn permits efficient * crypto operations, especially when using hardware. */ if ((m->m_flags & M_EXT) == 0) { if (mprev && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_mbcoalesced++; #endif } else { mprev = m; } continue; } /* * Writable mbufs are left alone (for now). */ if (M_WRITABLE(m)) { mprev = m; continue; } /* * Not writable, replace with a copy or coalesce with * the previous mbuf if possible (since we have to copy * it anyway, we try to reduce the number of mbufs and * clusters so that future work is easier). */ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); /* NB: we only coalesce into a cluster or larger */ if (mprev != NULL && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_clcoalesced++; #endif continue; } /* * Allocate new space to hold the copy and copy the data. * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by * splitting them into clusters. We could just malloc a * buffer and make it external but too many device drivers * don't know how to break up the non-contiguous memory when * doing DMA. */ n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(m0); return (NULL); } if (m->m_flags & M_PKTHDR) { KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", __func__, m0, m)); m_move_pkthdr(n, m); } len = m->m_len; off = 0; mfirst = n; mlast = NULL; for (;;) { int cc = min(len, MCLBYTES); memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); n->m_len = cc; if (mlast != NULL) mlast->m_next = n; mlast = n; #if 0 newipsecstat.ips_clcopied++; #endif len -= cc; if (len <= 0) break; off += cc; n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(mfirst); m_freem(m0); return (NULL); } } n->m_next = m->m_next; if (mprev == NULL) m0 = mfirst; /* new head of chain */ else mprev->m_next = mfirst; /* replace old mbuf */ m_free(m); /* release old mbuf */ mprev = mfirst; } return (m0); } #ifdef MBUF_PROFILING #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ struct mbufprofile { uintmax_t wasted[MP_BUCKETS]; uintmax_t used[MP_BUCKETS]; uintmax_t segments[MP_BUCKETS]; } mbprof; #define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */ #define MP_NUMLINES 6 #define MP_NUMSPERLINE 16 #define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */ /* work out max space needed and add a bit of spare space too */ #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE) #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES) char mbprofbuf[MP_BUFSIZE]; void m_profile(struct mbuf *m) { int segments = 0; int used = 0; int wasted = 0; while (m) { segments++; used += m->m_len; if (m->m_flags & M_EXT) { wasted += MHLEN - sizeof(m->m_ext) + m->m_ext.ext_size - m->m_len; } else { if (m->m_flags & M_PKTHDR) wasted += MHLEN - m->m_len; else wasted += MLEN - m->m_len; } m = m->m_next; } /* be paranoid.. it helps */ if (segments > MP_BUCKETS - 1) segments = MP_BUCKETS - 1; if (used > 100000) used = 100000; if (wasted > 100000) wasted = 100000; /* store in the appropriate bucket */ /* don't bother locking. if it's slightly off, so what? */ mbprof.segments[segments]++; mbprof.used[fls(used)]++; mbprof.wasted[fls(wasted)]++; } static void mbprof_textify(void) { int offset; char *c; uint64_t *p; p = &mbprof.wasted[0]; c = mbprofbuf; offset = snprintf(c, MP_MAXLINE + 10, "wasted:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.wasted[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.used[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "used:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.used[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.segments[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "segments:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.segments[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %jju", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif } static int mbprof_handler(SYSCTL_HANDLER_ARGS) { int error; mbprof_textify(); error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1); return (error); } static int mbprof_clr_handler(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(&mbprof, sizeof(mbprof)); } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, mbprof_handler, "A", "mbuf profiling statistics"); SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics"); #endif Index: projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/ar71xxreg.h =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/ar71xxreg.h (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/ar71xxreg.h (revision 312218) @@ -1,569 +1,573 @@ /*- * Copyright (c) 2009 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* $FreeBSD$ */ #ifndef _AR71XX_REG_H_ #define _AR71XX_REG_H_ /* PCI region */ #define AR71XX_PCI_MEM_BASE 0x10000000 /* * PCI mem windows is 0x08000000 bytes long but we exclude control * region from the resource manager */ #define AR71XX_PCI_MEM_SIZE 0x07000000 #define AR71XX_PCI_IRQ_START 0 #define AR71XX_PCI_IRQ_END 2 #define AR71XX_PCI_NIRQS 3 /* * PCI devices slots are starting from this number */ #define AR71XX_PCI_BASE_SLOT 17 /* PCI config registers */ #define AR71XX_PCI_LCONF_CMD 0x17010000 #define PCI_LCONF_CMD_READ 0x00000000 #define PCI_LCONF_CMD_WRITE 0x00010000 #define AR71XX_PCI_LCONF_WRITE_DATA 0x17010004 #define AR71XX_PCI_LCONF_READ_DATA 0x17010008 #define AR71XX_PCI_CONF_ADDR 0x1701000C #define AR71XX_PCI_CONF_CMD 0x17010010 #define PCI_CONF_CMD_READ 0x0000000A #define PCI_CONF_CMD_WRITE 0x0000000B #define AR71XX_PCI_CONF_WRITE_DATA 0x17010014 #define AR71XX_PCI_CONF_READ_DATA 0x17010018 #define AR71XX_PCI_ERROR 0x1701001C #define AR71XX_PCI_ERROR_ADDR 0x17010020 #define AR71XX_PCI_AHB_ERROR 0x17010024 #define AR71XX_PCI_AHB_ERROR_ADDR 0x17010028 /* APB region */ /* * Size is not really true actual APB window size is * 0x01000000 but it should handle OHCI memory as well * because this controller's interrupt is routed through * APB. */ #define AR71XX_APB_BASE 0x18000000 #define AR71XX_APB_SIZE 0x06000000 /* DDR registers */ #define AR71XX_DDR_CONFIG 0x18000000 #define AR71XX_DDR_CONFIG2 0x18000004 #define AR71XX_DDR_MODE_REGISTER 0x18000008 #define AR71XX_DDR_EXT_MODE_REGISTER 0x1800000C #define AR71XX_DDR_CONTROL 0x18000010 #define AR71XX_DDR_REFRESH 0x18000014 #define AR71XX_DDR_RD_DATA_THIS_CYCLE 0x18000018 #define AR71XX_TAP_CONTROL0 0x1800001C #define AR71XX_TAP_CONTROL1 0x18000020 #define AR71XX_TAP_CONTROL2 0x18000024 #define AR71XX_TAP_CONTROL3 0x18000028 #define AR71XX_PCI_WINDOW0 0x1800007C #define AR71XX_PCI_WINDOW1 0x18000080 #define AR71XX_PCI_WINDOW2 0x18000084 #define AR71XX_PCI_WINDOW3 0x18000088 #define AR71XX_PCI_WINDOW4 0x1800008C #define AR71XX_PCI_WINDOW5 0x18000090 #define AR71XX_PCI_WINDOW6 0x18000094 #define AR71XX_PCI_WINDOW7 0x18000098 #define AR71XX_WB_FLUSH_GE0 0x1800009C #define AR71XX_WB_FLUSH_GE1 0x180000A0 #define AR71XX_WB_FLUSH_USB 0x180000A4 #define AR71XX_WB_FLUSH_PCI 0x180000A8 /* * Values for PCI_WINDOW_X registers */ #define PCI_WINDOW0_ADDR 0x10000000 #define PCI_WINDOW1_ADDR 0x11000000 #define PCI_WINDOW2_ADDR 0x12000000 #define PCI_WINDOW3_ADDR 0x13000000 #define PCI_WINDOW4_ADDR 0x14000000 #define PCI_WINDOW5_ADDR 0x15000000 #define PCI_WINDOW6_ADDR 0x16000000 #define PCI_WINDOW7_ADDR 0x17000000 /* This value enables acces to PCI config registers */ #define PCI_WINDOW7_CONF_ADDR 0x07000000 #define AR71XX_UART_ADDR 0x18020000 +#define AR71XX_UART_THR 0x0 +#define AR71XX_UART_LSR 0x14 +#define AR71XX_UART_LSR_THRE (1 << 5) +#define AR71XX_UART_LSR_TEMT (1 << 6) #define AR71XX_USB_CTRL_FLADJ 0x18030000 #define USB_CTRL_FLADJ_HOST_SHIFT 12 #define USB_CTRL_FLADJ_A5_SHIFT 10 #define USB_CTRL_FLADJ_A4_SHIFT 8 #define USB_CTRL_FLADJ_A3_SHIFT 6 #define USB_CTRL_FLADJ_A2_SHIFT 4 #define USB_CTRL_FLADJ_A1_SHIFT 2 #define USB_CTRL_FLADJ_A0_SHIFT 0 #define AR71XX_USB_CTRL_CONFIG 0x18030004 #define USB_CTRL_CONFIG_OHCI_DES_SWAP (1 << 19) #define USB_CTRL_CONFIG_OHCI_BUF_SWAP (1 << 18) #define USB_CTRL_CONFIG_EHCI_DES_SWAP (1 << 17) #define USB_CTRL_CONFIG_EHCI_BUF_SWAP (1 << 16) #define USB_CTRL_CONFIG_DISABLE_XTL (1 << 13) #define USB_CTRL_CONFIG_OVERRIDE_XTL (1 << 12) #define USB_CTRL_CONFIG_CLK_SEL_SHIFT 4 #define USB_CTRL_CONFIG_CLK_SEL_MASK 3 #define USB_CTRL_CONFIG_CLK_SEL_12 0 #define USB_CTRL_CONFIG_CLK_SEL_24 1 #define USB_CTRL_CONFIG_CLK_SEL_48 2 #define USB_CTRL_CONFIG_OVER_CURRENT_AS_GPIO (1 << 8) #define USB_CTRL_CONFIG_SS_SIMULATION_MODE (1 << 2) #define USB_CTRL_CONFIG_RESUME_UTMI_PLS_DIS (1 << 1) #define USB_CTRL_CONFIG_UTMI_BACKWARD_ENB (1 << 0) #define AR71XX_GPIO_BASE 0x18040000 #define AR71XX_GPIO_OE 0x00 #define AR71XX_GPIO_IN 0x04 #define AR71XX_GPIO_OUT 0x08 #define AR71XX_GPIO_SET 0x0c #define AR71XX_GPIO_CLEAR 0x10 #define AR71XX_GPIO_INT 0x14 #define AR71XX_GPIO_INT_TYPE 0x18 #define AR71XX_GPIO_INT_POLARITY 0x1c #define AR71XX_GPIO_INT_PENDING 0x20 #define AR71XX_GPIO_INT_MASK 0x24 #define AR71XX_GPIO_FUNCTION 0x28 #define GPIO_FUNC_STEREO_EN (1 << 17) #define GPIO_FUNC_SLIC_EN (1 << 16) #define GPIO_FUNC_SPI_CS2_EN (1 << 13) /* CS2 is shared with GPIO_1 */ #define GPIO_FUNC_SPI_CS1_EN (1 << 12) /* CS1 is shared with GPIO_0 */ #define GPIO_FUNC_UART_EN (1 << 8) #define GPIO_FUNC_USB_OC_EN (1 << 4) #define GPIO_FUNC_USB_CLK_EN (0) #define AR71XX_BASE_FREQ 40000000 #define AR71XX_PLL_CPU_BASE 0x18050000 #define AR71XX_PLL_CPU_CONFIG 0x18050000 #define PLL_SW_UPDATE (1U << 31) #define PLL_LOCKED (1 << 30) #define PLL_AHB_DIV_SHIFT 20 #define PLL_AHB_DIV_MASK 7 #define PLL_DDR_DIV_SEL_SHIFT 18 #define PLL_DDR_DIV_SEL_MASK 3 #define PLL_CPU_DIV_SEL_SHIFT 16 #define PLL_CPU_DIV_SEL_MASK 3 #define PLL_LOOP_BW_SHIFT 12 #define PLL_LOOP_BW_MASK 0xf #define PLL_DIV_IN_SHIFT 10 #define PLL_DIV_IN_MASK 3 #define PLL_DIV_OUT_SHIFT 8 #define PLL_DIV_OUT_MASK 3 #define PLL_FB_SHIFT 3 #define PLL_FB_MASK 0x1f #define PLL_BYPASS (1 << 1) #define PLL_POWER_DOWN (1 << 0) #define AR71XX_PLL_SEC_CONFIG 0x18050004 #define AR71XX_PLL_ETH0_SHIFT 17 #define AR71XX_PLL_ETH1_SHIFT 19 #define AR71XX_PLL_CPU_CLK_CTRL 0x18050008 #define AR71XX_PLL_ETH_INT0_CLK 0x18050010 #define AR71XX_PLL_ETH_INT1_CLK 0x18050014 #define XPLL_ETH_INT_CLK_10 0x00991099 #define XPLL_ETH_INT_CLK_100 0x00441011 #define XPLL_ETH_INT_CLK_1000 0x13110000 #define XPLL_ETH_INT_CLK_1000_GMII 0x14110000 #define PLL_ETH_INT_CLK_10 0x00991099 #define PLL_ETH_INT_CLK_100 0x00001099 #define PLL_ETH_INT_CLK_1000 0x00110000 #define AR71XX_PLL_ETH_EXT_CLK 0x18050018 #define AR71XX_PLL_PCI_CLK 0x1805001C /* Reset block */ #define AR71XX_RST_BLOCK_BASE 0x18060000 #define AR71XX_RST_WDOG_CONTROL 0x18060008 #define RST_WDOG_LAST (1U << 31) #define RST_WDOG_ACTION_MASK 3 #define RST_WDOG_ACTION_RESET 3 #define RST_WDOG_ACTION_NMI 2 #define RST_WDOG_ACTION_GP_INTR 1 #define RST_WDOG_ACTION_NOACTION 0 #define AR71XX_RST_WDOG_TIMER 0x1806000C /* * APB interrupt status and mask register and interrupt bit numbers for */ #define AR71XX_MISC_INTR_STATUS 0x18060010 #define AR71XX_MISC_INTR_MASK 0x18060014 #define MISC_INTR_TIMER 0 #define MISC_INTR_ERROR 1 #define MISC_INTR_GPIO 2 #define MISC_INTR_UART 3 #define MISC_INTR_WATCHDOG 4 #define MISC_INTR_PERF 5 #define MISC_INTR_OHCI 6 #define MISC_INTR_DMA 7 #define AR71XX_PCI_INTR_STATUS 0x18060018 #define AR71XX_PCI_INTR_MASK 0x1806001C #define PCI_INTR_CORE (1 << 4) #define AR71XX_RST_RESET 0x18060024 #define RST_RESET_FULL_CHIP (1 << 24) /* Same as pulling the reset pin */ #define RST_RESET_CPU_COLD (1 << 20) /* Cold reset */ #define RST_RESET_GE1_MAC (1 << 13) #define RST_RESET_GE1_PHY (1 << 12) #define RST_RESET_GE0_MAC (1 << 9) #define RST_RESET_GE0_PHY (1 << 8) #define RST_RESET_USB_OHCI_DLL (1 << 6) #define RST_RESET_USB_HOST (1 << 5) #define RST_RESET_USB_PHY (1 << 4) #define RST_RESET_PCI_BUS (1 << 1) #define RST_RESET_PCI_CORE (1 << 0) /* Chipset revision details */ #define AR71XX_RST_RESET_REG_REV_ID 0x18060090 #define REV_ID_MAJOR_MASK 0xfff0 #define REV_ID_MAJOR_AR71XX 0x00a0 #define REV_ID_MAJOR_AR913X 0x00b0 #define REV_ID_MAJOR_AR7240 0x00c0 #define REV_ID_MAJOR_AR7241 0x0100 #define REV_ID_MAJOR_AR7242 0x1100 /* AR71XX chipset revision details */ #define AR71XX_REV_ID_MINOR_MASK 0x3 #define AR71XX_REV_ID_MINOR_AR7130 0x0 #define AR71XX_REV_ID_MINOR_AR7141 0x1 #define AR71XX_REV_ID_MINOR_AR7161 0x2 #define AR71XX_REV_ID_REVISION_MASK 0x3 #define AR71XX_REV_ID_REVISION_SHIFT 2 /* AR724X chipset revision details */ #define AR724X_REV_ID_REVISION_MASK 0x3 /* AR91XX chipset revision details */ #define AR91XX_REV_ID_MINOR_MASK 0x3 #define AR91XX_REV_ID_MINOR_AR9130 0x0 #define AR91XX_REV_ID_MINOR_AR9132 0x1 #define AR91XX_REV_ID_REVISION_MASK 0x3 #define AR91XX_REV_ID_REVISION_SHIFT 2 typedef enum { AR71XX_MII_MODE_NONE = 0, AR71XX_MII_MODE_GMII, AR71XX_MII_MODE_MII, AR71XX_MII_MODE_RGMII, AR71XX_MII_MODE_RMII, AR71XX_MII_MODE_SGMII /* not hardware defined, though! */ } ar71xx_mii_mode; /* * AR71xx MII control region */ #define AR71XX_MII0_CTRL 0x18070000 #define MII_CTRL_SPEED_SHIFT 4 #define MII_CTRL_SPEED_MASK 3 #define MII_CTRL_SPEED_10 0 #define MII_CTRL_SPEED_100 1 #define MII_CTRL_SPEED_1000 2 #define MII_CTRL_IF_MASK 3 #define MII_CTRL_IF_SHIFT 0 #define MII0_CTRL_IF_GMII 0 #define MII0_CTRL_IF_MII 1 #define MII0_CTRL_IF_RGMII 2 #define MII0_CTRL_IF_RMII 3 #define AR71XX_MII1_CTRL 0x18070004 #define MII1_CTRL_IF_RGMII 0 #define MII1_CTRL_IF_RMII 1 /* * GigE adapters region */ #define AR71XX_MAC0_BASE 0x19000000 #define AR71XX_MAC1_BASE 0x1A000000 #define AR71XX_MAC_CFG1 0x00 #define MAC_CFG1_SOFT_RESET (1U << 31) #define MAC_CFG1_SIMUL_RESET (1 << 30) #define MAC_CFG1_MAC_RX_BLOCK_RESET (1 << 19) #define MAC_CFG1_MAC_TX_BLOCK_RESET (1 << 18) #define MAC_CFG1_RX_FUNC_RESET (1 << 17) #define MAC_CFG1_TX_FUNC_RESET (1 << 16) #define MAC_CFG1_LOOPBACK (1 << 8) #define MAC_CFG1_RXFLOW_CTRL (1 << 5) #define MAC_CFG1_TXFLOW_CTRL (1 << 4) #define MAC_CFG1_SYNC_RX (1 << 3) #define MAC_CFG1_RX_ENABLE (1 << 2) #define MAC_CFG1_SYNC_TX (1 << 1) #define MAC_CFG1_TX_ENABLE (1 << 0) #define AR71XX_MAC_CFG2 0x04 #define MAC_CFG2_PREAMBLE_LEN_MASK 0xf #define MAC_CFG2_PREAMBLE_LEN_SHIFT 12 #define MAC_CFG2_IFACE_MODE_1000 (2 << 8) #define MAC_CFG2_IFACE_MODE_10_100 (1 << 8) #define MAC_CFG2_IFACE_MODE_SHIFT 8 #define MAC_CFG2_IFACE_MODE_MASK 3 #define MAC_CFG2_HUGE_FRAME (1 << 5) #define MAC_CFG2_LENGTH_FIELD (1 << 4) #define MAC_CFG2_ENABLE_PADCRC (1 << 2) #define MAC_CFG2_ENABLE_CRC (1 << 1) #define MAC_CFG2_FULL_DUPLEX (1 << 0) #define AR71XX_MAC_IFG 0x08 #define AR71XX_MAC_HDUPLEX 0x0C #define AR71XX_MAC_MAX_FRAME_LEN 0x10 #define AR71XX_MAC_MII_CFG 0x20 #define MAC_MII_CFG_RESET (1U << 31) #define MAC_MII_CFG_SCAN_AUTO_INC (1 << 5) #define MAC_MII_CFG_PREAMBLE_SUP (1 << 4) #define MAC_MII_CFG_CLOCK_SELECT_MASK 0x7 #define MAC_MII_CFG_CLOCK_SELECT_MASK_AR933X 0xf #define MAC_MII_CFG_CLOCK_DIV_4 0 #define MAC_MII_CFG_CLOCK_DIV_6 2 #define MAC_MII_CFG_CLOCK_DIV_8 3 #define MAC_MII_CFG_CLOCK_DIV_10 4 #define MAC_MII_CFG_CLOCK_DIV_14 5 #define MAC_MII_CFG_CLOCK_DIV_20 6 #define MAC_MII_CFG_CLOCK_DIV_28 7 /* .. and the AR933x/AR934x extensions */ #define MAC_MII_CFG_CLOCK_DIV_34 8 #define MAC_MII_CFG_CLOCK_DIV_42 9 #define MAC_MII_CFG_CLOCK_DIV_50 10 #define MAC_MII_CFG_CLOCK_DIV_58 11 #define MAC_MII_CFG_CLOCK_DIV_66 12 #define MAC_MII_CFG_CLOCK_DIV_74 13 #define MAC_MII_CFG_CLOCK_DIV_82 14 #define MAC_MII_CFG_CLOCK_DIV_98 15 #define AR71XX_MAC_MII_CMD 0x24 #define MAC_MII_CMD_SCAN_CYCLE (1 << 1) #define MAC_MII_CMD_READ 1 #define MAC_MII_CMD_WRITE 0 #define AR71XX_MAC_MII_ADDR 0x28 #define MAC_MII_PHY_ADDR_SHIFT 8 #define MAC_MII_PHY_ADDR_MASK 0xff #define MAC_MII_REG_MASK 0x1f #define AR71XX_MAC_MII_CONTROL 0x2C #define MAC_MII_CONTROL_MASK 0xffff #define AR71XX_MAC_MII_STATUS 0x30 #define MAC_MII_STATUS_MASK 0xffff #define AR71XX_MAC_MII_INDICATOR 0x34 #define MAC_MII_INDICATOR_NOT_VALID (1 << 2) #define MAC_MII_INDICATOR_SCANNING (1 << 1) #define MAC_MII_INDICATOR_BUSY (1 << 0) #define AR71XX_MAC_IFCONTROL 0x38 #define MAC_IFCONTROL_SPEED (1 << 16) #define AR71XX_MAC_STA_ADDR1 0x40 #define AR71XX_MAC_STA_ADDR2 0x44 #define AR71XX_MAC_FIFO_CFG0 0x48 #define FIFO_CFG0_TX_FABRIC (1 << 4) #define FIFO_CFG0_TX_SYSTEM (1 << 3) #define FIFO_CFG0_RX_FABRIC (1 << 2) #define FIFO_CFG0_RX_SYSTEM (1 << 1) #define FIFO_CFG0_WATERMARK (1 << 0) #define FIFO_CFG0_ALL ((1 << 5) - 1) #define FIFO_CFG0_ENABLE_SHIFT 8 #define AR71XX_MAC_FIFO_CFG1 0x4C #define AR71XX_MAC_FIFO_CFG2 0x50 #define AR71XX_MAC_FIFO_TX_THRESHOLD 0x54 #define AR71XX_MAC_FIFO_RX_FILTMATCH 0x58 /* * These flags applicable both to AR71XX_MAC_FIFO_RX_FILTMASK and * to AR71XX_MAC_FIFO_RX_FILTMATCH */ #define FIFO_RX_MATCH_UNICAST (1 << 17) #define FIFO_RX_MATCH_TRUNC_FRAME (1 << 16) #define FIFO_RX_MATCH_VLAN_TAG (1 << 15) #define FIFO_RX_MATCH_UNSUP_OPCODE (1 << 14) #define FIFO_RX_MATCH_PAUSE_FRAME (1 << 13) #define FIFO_RX_MATCH_CTRL_FRAME (1 << 12) #define FIFO_RX_MATCH_LONG_EVENT (1 << 11) #define FIFO_RX_MATCH_DRIBBLE_NIBBLE (1 << 10) #define FIFO_RX_MATCH_BCAST (1 << 9) #define FIFO_RX_MATCH_MCAST (1 << 8) #define FIFO_RX_MATCH_OK (1 << 7) #define FIFO_RX_MATCH_OORANGE (1 << 6) #define FIFO_RX_MATCH_LEN_MSMTCH (1 << 5) #define FIFO_RX_MATCH_CRC_ERROR (1 << 4) #define FIFO_RX_MATCH_CODE_ERROR (1 << 3) #define FIFO_RX_MATCH_FALSE_CARRIER (1 << 2) #define FIFO_RX_MATCH_RX_DV_EVENT (1 << 1) #define FIFO_RX_MATCH_DROP_EVENT (1 << 0) /* * Exclude unicast and truncated frames from matching */ #define FIFO_RX_FILTMATCH_DEFAULT \ (FIFO_RX_MATCH_VLAN_TAG | \ FIFO_RX_MATCH_UNSUP_OPCODE | \ FIFO_RX_MATCH_PAUSE_FRAME | \ FIFO_RX_MATCH_CTRL_FRAME | \ FIFO_RX_MATCH_LONG_EVENT | \ FIFO_RX_MATCH_DRIBBLE_NIBBLE | \ FIFO_RX_MATCH_BCAST | \ FIFO_RX_MATCH_MCAST | \ FIFO_RX_MATCH_OK | \ FIFO_RX_MATCH_OORANGE | \ FIFO_RX_MATCH_LEN_MSMTCH | \ FIFO_RX_MATCH_CRC_ERROR | \ FIFO_RX_MATCH_CODE_ERROR | \ FIFO_RX_MATCH_FALSE_CARRIER | \ FIFO_RX_MATCH_RX_DV_EVENT | \ FIFO_RX_MATCH_DROP_EVENT) #define AR71XX_MAC_FIFO_RX_FILTMASK 0x5C #define FIFO_RX_MASK_BYTE_MODE (1 << 19) #define FIFO_RX_MASK_NO_SHORT_FRAME (1 << 18) #define FIFO_RX_MASK_BIT17 (1 << 17) #define FIFO_RX_MASK_BIT16 (1 << 16) #define FIFO_RX_MASK_TRUNC_FRAME (1 << 15) #define FIFO_RX_MASK_LONG_EVENT (1 << 14) #define FIFO_RX_MASK_VLAN_TAG (1 << 13) #define FIFO_RX_MASK_UNSUP_OPCODE (1 << 12) #define FIFO_RX_MASK_PAUSE_FRAME (1 << 11) #define FIFO_RX_MASK_CTRL_FRAME (1 << 10) #define FIFO_RX_MASK_DRIBBLE_NIBBLE (1 << 9) #define FIFO_RX_MASK_BCAST (1 << 8) #define FIFO_RX_MASK_MCAST (1 << 7) #define FIFO_RX_MASK_OK (1 << 6) #define FIFO_RX_MASK_OORANGE (1 << 5) #define FIFO_RX_MASK_LEN_MSMTCH (1 << 4) #define FIFO_RX_MASK_CODE_ERROR (1 << 3) #define FIFO_RX_MASK_FALSE_CARRIER (1 << 2) #define FIFO_RX_MASK_RX_DV_EVENT (1 << 1) #define FIFO_RX_MASK_DROP_EVENT (1 << 0) /* * Len. mismatch, unsup. opcode and short frmae bits excluded */ #define FIFO_RX_FILTMASK_DEFAULT \ (FIFO_RX_MASK_NO_SHORT_FRAME | \ FIFO_RX_MASK_BIT17 | \ FIFO_RX_MASK_BIT16 | \ FIFO_RX_MASK_TRUNC_FRAME | \ FIFO_RX_MASK_LONG_EVENT | \ FIFO_RX_MASK_VLAN_TAG | \ FIFO_RX_MASK_PAUSE_FRAME | \ FIFO_RX_MASK_CTRL_FRAME | \ FIFO_RX_MASK_DRIBBLE_NIBBLE | \ FIFO_RX_MASK_BCAST | \ FIFO_RX_MASK_MCAST | \ FIFO_RX_MASK_OK | \ FIFO_RX_MASK_OORANGE | \ FIFO_RX_MASK_CODE_ERROR | \ FIFO_RX_MASK_FALSE_CARRIER | \ FIFO_RX_MASK_RX_DV_EVENT | \ FIFO_RX_MASK_DROP_EVENT) #define AR71XX_MAC_FIFO_RAM0 0x60 #define AR71XX_MAC_FIFO_RAM1 0x64 #define AR71XX_MAC_FIFO_RAM2 0x68 #define AR71XX_MAC_FIFO_RAM3 0x6C #define AR71XX_MAC_FIFO_RAM4 0x70 #define AR71XX_MAC_FIFO_RAM5 0x74 #define AR71XX_MAC_FIFO_RAM6 0x78 #define AR71XX_DMA_TX_CONTROL 0x180 #define DMA_TX_CONTROL_EN (1 << 0) #define AR71XX_DMA_TX_DESC 0x184 #define AR71XX_DMA_TX_STATUS 0x188 #define DMA_TX_STATUS_PCOUNT_MASK 0xff #define DMA_TX_STATUS_PCOUNT_SHIFT 16 #define DMA_TX_STATUS_BUS_ERROR (1 << 3) #define DMA_TX_STATUS_UNDERRUN (1 << 1) #define DMA_TX_STATUS_PKT_SENT (1 << 0) #define AR71XX_DMA_RX_CONTROL 0x18C #define DMA_RX_CONTROL_EN (1 << 0) #define AR71XX_DMA_RX_DESC 0x190 #define AR71XX_DMA_RX_STATUS 0x194 #define DMA_RX_STATUS_PCOUNT_MASK 0xff #define DMA_RX_STATUS_PCOUNT_SHIFT 16 #define DMA_RX_STATUS_BUS_ERROR (1 << 3) #define DMA_RX_STATUS_OVERFLOW (1 << 2) #define DMA_RX_STATUS_PKT_RECVD (1 << 0) #define AR71XX_DMA_INTR 0x198 #define AR71XX_DMA_INTR_STATUS 0x19C #define DMA_INTR_ALL ((1 << 8) - 1) #define DMA_INTR_RX_BUS_ERROR (1 << 7) #define DMA_INTR_RX_OVERFLOW (1 << 6) #define DMA_INTR_RX_PKT_RCVD (1 << 4) #define DMA_INTR_TX_BUS_ERROR (1 << 3) #define DMA_INTR_TX_UNDERRUN (1 << 1) #define DMA_INTR_TX_PKT_SENT (1 << 0) #define AR71XX_SPI_BASE 0x1f000000 #define AR71XX_SPI_FS 0x00 #define AR71XX_SPI_CTRL 0x04 #define SPI_CTRL_REMAP_DISABLE (1 << 6) #define SPI_CTRL_CLOCK_DIVIDER_MASK ((1 << 6) - 1) #define AR71XX_SPI_IO_CTRL 0x08 #define SPI_IO_CTRL_CS2 (1 << 18) #define SPI_IO_CTRL_CS1 (1 << 17) #define SPI_IO_CTRL_CS0 (1 << 16) #define SPI_IO_CTRL_CSMASK (7 << 16) #define SPI_IO_CTRL_CLK (1 << 8) #define SPI_IO_CTRL_DO 1 #define AR71XX_SPI_RDS 0x0C #define ATH_READ_REG(reg) \ *((volatile uint32_t *)MIPS_PHYS_TO_KSEG1((reg))) #define ATH_WRITE_REG(reg, val) \ do { \ *((volatile uint32_t *)MIPS_PHYS_TO_KSEG1((reg))) = (val); \ (void) ATH_READ_REG(reg); \ } while (0) static inline void ar71xx_ddr_flush(uint32_t reg) { ATH_WRITE_REG(reg, 1); while ((ATH_READ_REG(reg) & 0x1)) ; ATH_WRITE_REG(reg, 1); while ((ATH_READ_REG(reg) & 0x1)) ; } static inline void ar71xx_write_pll(uint32_t cfg_reg, uint32_t pll_reg, uint32_t pll, uint32_t pll_reg_shift) { uint32_t sec_cfg; /* set PLL registers */ sec_cfg = ATH_READ_REG(cfg_reg); sec_cfg &= ~(3 << pll_reg_shift); sec_cfg |= (2 << pll_reg_shift); ATH_WRITE_REG(cfg_reg, sec_cfg); DELAY(100); ATH_WRITE_REG(pll_reg, pll); sec_cfg |= (3 << pll_reg_shift); ATH_WRITE_REG(cfg_reg, sec_cfg); DELAY(100); sec_cfg &= ~(3 << pll_reg_shift); ATH_WRITE_REG(cfg_reg, sec_cfg); DELAY(100); } #endif /* _AR71XX_REG_H_ */ Index: projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/uart_bus_ar71xx.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/uart_bus_ar71xx.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/uart_bus_ar71xx.c (revision 312218) @@ -1,89 +1,106 @@ /*- * Copyright (c) 2009, Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF */ #include "opt_uart.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "uart_if.h" static int uart_ar71xx_probe(device_t dev); extern struct uart_class uart_ar71xx_uart_class; static device_method_t uart_ar71xx_methods[] = { /* Device interface */ DEVMETHOD(device_probe, uart_ar71xx_probe), DEVMETHOD(device_attach, uart_bus_attach), DEVMETHOD(device_detach, uart_bus_detach), { 0, 0 } }; static driver_t uart_ar71xx_driver = { uart_driver_name, uart_ar71xx_methods, sizeof(struct uart_softc), }; extern SLIST_HEAD(uart_devinfo_list, uart_devinfo) uart_sysdevs; static int uart_ar71xx_probe(device_t dev) { struct uart_softc *sc; uint64_t freq; freq = ar71xx_uart_freq(); sc = device_get_softc(dev); sc->sc_sysdev = SLIST_FIRST(&uart_sysdevs); sc->sc_class = &uart_ns8250_class; bcopy(&sc->sc_sysdev->bas, &sc->sc_bas, sizeof(sc->sc_bas)); sc->sc_sysdev->bas.regshft = 2; sc->sc_sysdev->bas.bst = mips_bus_space_generic; sc->sc_sysdev->bas.bsh = MIPS_PHYS_TO_KSEG1(AR71XX_UART_ADDR) + 3; sc->sc_bas.regshft = 2; sc->sc_bas.bst = mips_bus_space_generic; sc->sc_bas.bsh = MIPS_PHYS_TO_KSEG1(AR71XX_UART_ADDR) + 3; return (uart_bus_probe(dev, 2, freq, 0, 0)); } +#ifdef EARLY_PRINTF +static void +ar71xx_early_putc(int c) +{ + int i; + + for (i = 0; i < 1000; i++) { + if (ATH_READ_REG(AR71XX_UART_ADDR + AR71XX_UART_LSR) + & AR71XX_UART_LSR_THRE) + break; + } + + ATH_WRITE_REG(AR71XX_UART_ADDR + AR71XX_UART_THR, (c & 0xff)); +} +early_putc_t *early_putc = ar71xx_early_putc; +#endif + DRIVER_MODULE(uart, apb, uart_ar71xx_driver, uart_devclass, 0, 0); Index: projects/netbsd-tests-upstream-01-2017/sys/mips/conf/ONIONOMEGA.hints =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/mips/conf/ONIONOMEGA.hints (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/mips/conf/ONIONOMEGA.hints (revision 312218) @@ -1,94 +1,125 @@ # # This file adds to the values in AR933X_BASE.hints. # # $FreeBSD$ # mdiobus on arge1 hint.argemdio.0.at="nexus0" hint.argemdio.0.maddr=0x1a000000 hint.argemdio.0.msize=0x1000 hint.argemdio.0.order=0 # Embedded Atheros Switch hint.arswitch.0.at="mdio0" # XXX this should really say it's an AR933x switch, as there # are some vlan specific differences here! hint.arswitch.0.is_7240=1 hint.arswitch.0.numphys=4 hint.arswitch.0.phy4cpu=1 # phy 4 is a "CPU" separate PHY hint.arswitch.0.is_rgmii=0 hint.arswitch.0.is_gmii=1 # arge1 <-> switch PHY is GMII # arge0 - MII, autoneg, phy(4) hint.arge.0.phymask=0x10 # PHY4 hint.arge.0.mdio=mdioproxy1 # .. off of the switch mdiobus hint.arge.0.eeprommac=0x1fff0000 # arge1 - GMII, 1000/full hint.arge.1.phymask=0x0 # No directly mapped PHYs hint.arge.1.media=1000 hint.arge.1.fduplex=1 hint.arge.1.eeprommac=0x1fff0006 +# ath0 +hint.ath.0.eepromaddr=0x1fff0000 +hint.ath.0.eepromsize=16384 + # 16MB flash layout: # [ 0.510000] 5 tp-link partitions found on MTD device spi0.0 # [ 0.510000] Creating 5 MTD partitions on "spi0.0": # [ 0.520000] 0x000000000000-0x000000020000 : "u-boot" # [ 0.520000] 0x000000020000-0x000000136468 : "kernel" # [ 0.530000] 0x000000136468-0x000000ff0000 : "rootfs" # [ 0.530000] mtd: device 2 (rootfs) set to be root filesystem # [ 0.540000] 1 squashfs-split partitions found on MTD device rootfs # [ 0.540000] 0x000000730000-0x000000fe0000 : "rootfs_data" # [ 0.540000] 0x000000fe0000-0x000000ff0000 : "nvram" # [ 0.550000] 0x000000ff0000-0x000001000000 : "art" # [ 0.560000] 0x000000020000-0x000000fe0000 : "firmware" # 64KiB uboot hint.map.0.at="flash/spi0" hint.map.0.start=0x00000000 hint.map.0.end=0x00010000 hint.map.0.name="u-boot" hint.map.0.readonly=1 # 64KiB uboot hint.map.1.at="flash/spi0" hint.map.1.start=0x00010000 hint.map.1.end=0x00020000 hint.map.1.name="uboot-env" hint.map.1.readonly=1 # kernel hint.map.2.at="flash/spi0" hint.map.2.start=0x00020000 hint.map.2.end="search:0x00020000:0x10000:.!/bin/sh" hint.map.2.name="kernel" hint.map.2.readonly=1 # rootfs ulzma hint.map.3.at="flash/spi0" hint.map.3.start="search:0x00020000:0x10000:.!/bin/sh" hint.map.3.end=0x00fe0000 hint.map.3.name="rootfs" hint.map.3.readonly=1 # 64KiB cfg hint.map.4.at="flash/spi0" hint.map.4.start=0x00fe0000 hint.map.4.end=0x00ff0000 hint.map.4.name="cfg" hint.map.4.readonly=0 # all firmware 16000KiB hint.map.5.at="flash/spi0" hint.map.5.start=0x00020000 hint.map.5.end=0x00ff0000 hint.map.5.name="firmware" hint.map.5.readonly=0 # 64KiB ART hint.map.6.at="flash/spi0" hint.map.6.start=0x00ff0000 hint.map.6.end=0x01000000 hint.map.6.name="ART" hint.map.6.readonly=1 + +# GPIO +hint.gpio.0.pinmask=0x0c8ff1c3 + +hint.gpioled.0.at="gpiobus0" +hint.gpioled.0.pins=0x08000000 +hint.gpioled.0.name="board" +hint.gpioled.0.invert=0 + +#Red +hint.gpioled.1.at="gpiobus0" +hint.gpioled.1.pins=0x00020000 +hint.gpioled.1.name="red" +hint.gpioled.1.invert=0 + +#Green +hint.gpioled.2.at="gpiobus0" +hint.gpioled.2.pins=0x00010000 +hint.gpioled.2.name="green" +hint.gpioled.2.invert=0 + +#Blue +hint.gpioled.3.at="gpiobus0" +hint.gpioled.3.pins=0x00008000 +hint.gpioled.3.name="blue" +hint.gpioled.3.invert=0 + Index: projects/netbsd-tests-upstream-01-2017/sys/net/iflib.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/net/iflib.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/net/iflib.c (revision 312218) @@ -1,5139 +1,5147 @@ /*- * Copyright (c) 2014-2016, Matthew Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of Matthew Macy nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ifdi_if.h" #if defined(__i386__) || defined(__amd64__) #include #include #include #include #include #include #endif /* * enable accounting of every mbuf as it comes in to and goes out of iflib's software descriptor references */ #define MEMORY_LOGGING 0 /* * Enable mbuf vectors for compressing long mbuf chains */ /* * NB: * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead * we prefetch needs to be determined by the time spent in m_free vis a vis * the cost of a prefetch. This will of course vary based on the workload: * - NFLX's m_free path is dominated by vm-based M_EXT manipulation which * is quite expensive, thus suggesting very little prefetch. * - small packet forwarding which is just returning a single mbuf to * UMA will typically be very fast vis a vis the cost of a memory * access. */ /* * File organization: * - private structures * - iflib private utility functions * - ifnet functions * - vlan registry and other exported functions * - iflib public core functions * * */ static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library"); struct iflib_txq; typedef struct iflib_txq *iflib_txq_t; struct iflib_rxq; typedef struct iflib_rxq *iflib_rxq_t; struct iflib_fl; typedef struct iflib_fl *iflib_fl_t; +struct iflib_ctx; + typedef struct iflib_filter_info { driver_filter_t *ifi_filter; void *ifi_filter_arg; struct grouptask *ifi_task; + struct iflib_ctx *ifi_ctx; } *iflib_filter_info_t; struct iflib_ctx { KOBJ_FIELDS; /* * Pointer to hardware driver's softc */ void *ifc_softc; device_t ifc_dev; if_t ifc_ifp; cpuset_t ifc_cpus; if_shared_ctx_t ifc_sctx; struct if_softc_ctx ifc_softc_ctx; struct mtx ifc_mtx; uint16_t ifc_nhwtxqs; uint16_t ifc_nhwrxqs; iflib_txq_t ifc_txqs; iflib_rxq_t ifc_rxqs; uint32_t ifc_if_flags; uint32_t ifc_flags; uint32_t ifc_max_fl_buf_size; int ifc_in_detach; int ifc_link_state; int ifc_link_irq; int ifc_pause_frames; int ifc_watchdog_events; struct cdev *ifc_led_dev; struct resource *ifc_msix_mem; struct if_irq ifc_legacy_irq; struct grouptask ifc_admin_task; struct grouptask ifc_vflr_task; struct iflib_filter_info ifc_filter_info; struct ifmedia ifc_media; struct sysctl_oid *ifc_sysctl_node; uint16_t ifc_sysctl_ntxqs; uint16_t ifc_sysctl_nrxqs; uint16_t ifc_sysctl_qs_eq_override; uint16_t ifc_sysctl_ntxds[8]; uint16_t ifc_sysctl_nrxds[8]; struct if_txrx ifc_txrx; #define isc_txd_encap ifc_txrx.ift_txd_encap #define isc_txd_flush ifc_txrx.ift_txd_flush #define isc_txd_credits_update ifc_txrx.ift_txd_credits_update #define isc_rxd_available ifc_txrx.ift_rxd_available #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_flush ifc_txrx.ift_rxd_flush #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_rxd_refill ifc_txrx.ift_rxd_refill #define isc_legacy_intr ifc_txrx.ift_legacy_intr eventhandler_tag ifc_vlan_attach_event; eventhandler_tag ifc_vlan_detach_event; uint8_t ifc_mac[ETHER_ADDR_LEN]; char ifc_mtx_name[16]; }; void * iflib_get_softc(if_ctx_t ctx) { return (ctx->ifc_softc); } device_t iflib_get_dev(if_ctx_t ctx) { return (ctx->ifc_dev); } if_t iflib_get_ifp(if_ctx_t ctx) { return (ctx->ifc_ifp); } struct ifmedia * iflib_get_media(if_ctx_t ctx) { return (&ctx->ifc_media); } void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]) { bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN); } if_softc_ctx_t iflib_get_softc_ctx(if_ctx_t ctx) { return (&ctx->ifc_softc_ctx); } if_shared_ctx_t iflib_get_sctx(if_ctx_t ctx) { return (ctx->ifc_sctx); } #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*)) #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP) #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF) #define RX_SW_DESC_MAP_CREATED (1 << 0) #define TX_SW_DESC_MAP_CREATED (1 << 1) #define RX_SW_DESC_INUSE (1 << 3) #define TX_SW_DESC_MAPPED (1 << 4) typedef struct iflib_sw_rx_desc { bus_dmamap_t ifsd_map; /* bus_dma map for packet */ struct mbuf *ifsd_m; /* rx: uninitialized mbuf */ caddr_t ifsd_cl; /* direct cluster pointer for rx */ uint16_t ifsd_flags; } *iflib_rxsd_t; typedef struct iflib_sw_tx_desc_val { bus_dmamap_t ifsd_map; /* bus_dma map for packet */ struct mbuf *ifsd_m; /* pkthdr mbuf */ uint8_t ifsd_flags; } *iflib_txsd_val_t; typedef struct iflib_sw_tx_desc_array { bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */ struct mbuf **ifsd_m; /* pkthdr mbufs */ uint8_t *ifsd_flags; } iflib_txsd_array_t; /* magic number that should be high enough for any hardware */ #define IFLIB_MAX_TX_SEGS 128 #define IFLIB_MAX_RX_SEGS 32 #define IFLIB_RX_COPY_THRESH 128 #define IFLIB_MAX_RX_REFRESH 32 #define IFLIB_QUEUE_IDLE 0 #define IFLIB_QUEUE_HUNG 1 #define IFLIB_QUEUE_WORKING 2 /* this should really scale with ring size - 32 is a fairly arbitrary value for this */ #define TX_BATCH_SIZE 16 #define IFLIB_RESTART_BUDGET 8 #define IFC_LEGACY 0x01 #define IFC_QFLUSH 0x02 #define IFC_MULTISEG 0x04 #define IFC_DMAR 0x08 #define IFC_SC_ALLOCATED 0x10 +#define IFC_INIT_DONE 0x20 + #define CSUM_OFFLOAD (CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \ CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \ CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP) struct iflib_txq { uint16_t ift_in_use; uint16_t ift_cidx; uint16_t ift_cidx_processed; uint16_t ift_pidx; uint8_t ift_gen; uint8_t ift_db_pending; uint8_t ift_db_pending_queued; uint8_t ift_npending; uint8_t ift_br_offset; /* implicit pad */ uint64_t ift_processed; uint64_t ift_cleaned; #if MEMORY_LOGGING uint64_t ift_enqueued; uint64_t ift_dequeued; #endif uint64_t ift_no_tx_dma_setup; uint64_t ift_no_desc_avail; uint64_t ift_mbuf_defrag_failed; uint64_t ift_mbuf_defrag; uint64_t ift_map_failed; uint64_t ift_txd_encap_efbig; uint64_t ift_pullups; struct mtx ift_mtx; struct mtx ift_db_mtx; /* constant values */ if_ctx_t ift_ctx; struct ifmp_ring **ift_br; struct grouptask ift_task; uint16_t ift_size; uint16_t ift_id; struct callout ift_timer; struct callout ift_db_check; iflib_txsd_array_t ift_sds; uint8_t ift_nbr; uint8_t ift_qstatus; uint8_t ift_active; uint8_t ift_closed; int ift_watchdog_time; struct iflib_filter_info ift_filter_info; bus_dma_tag_t ift_desc_tag; bus_dma_tag_t ift_tso_desc_tag; iflib_dma_info_t ift_ifdi; #define MTX_NAME_LEN 16 char ift_mtx_name[MTX_NAME_LEN]; char ift_db_mtx_name[MTX_NAME_LEN]; bus_dma_segment_t ift_segs[IFLIB_MAX_TX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ift_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); struct iflib_fl { uint16_t ifl_cidx; uint16_t ifl_pidx; uint16_t ifl_credits; uint8_t ifl_gen; #if MEMORY_LOGGING uint64_t ifl_m_enqueued; uint64_t ifl_m_dequeued; uint64_t ifl_cl_enqueued; uint64_t ifl_cl_dequeued; #endif /* implicit pad */ /* constant */ uint16_t ifl_size; uint16_t ifl_buf_size; uint16_t ifl_cltype; uma_zone_t ifl_zone; iflib_rxsd_t ifl_sds; iflib_rxq_t ifl_rxq; uint8_t ifl_id; bus_dma_tag_t ifl_desc_tag; iflib_dma_info_t ifl_ifdi; uint64_t ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE); caddr_t ifl_vm_addrs[IFLIB_MAX_RX_REFRESH]; } __aligned(CACHE_LINE_SIZE); static inline int get_inuse(int size, int cidx, int pidx, int gen) { int used; if (pidx > cidx) used = pidx - cidx; else if (pidx < cidx) used = size - cidx + pidx; else if (gen == 0 && pidx == cidx) used = 0; else if (gen == 1 && pidx == cidx) used = size; else panic("bad state"); return (used); } #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen)) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head)) struct iflib_rxq { /* If there is a separate completion queue - * these are the cq cidx and pidx. Otherwise * these are unused. */ uint16_t ifr_size; uint16_t ifr_cq_cidx; uint16_t ifr_cq_pidx; uint8_t ifr_cq_gen; uint8_t ifr_fl_offset; if_ctx_t ifr_ctx; iflib_fl_t ifr_fl; uint64_t ifr_rx_irq; uint16_t ifr_id; uint8_t ifr_lro_enabled; uint8_t ifr_nfl; struct lro_ctrl ifr_lc; struct grouptask ifr_task; struct iflib_filter_info ifr_filter_info; iflib_dma_info_t ifr_ifdi; /* dynamically allocate if any drivers need a value substantially larger than this */ struct if_rxd_frag ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE); #ifdef IFLIB_DIAGNOSTICS uint64_t ifr_cpu_exec_count[256]; #endif } __aligned(CACHE_LINE_SIZE); /* * Only allow a single packet to take up most 1/nth of the tx ring */ #define MAX_SINGLE_PACKET_FRACTION 12 #define IF_BAD_DMA (bus_addr_t)-1 static int enable_msix = 1; #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING)) #define CTX_LOCK_INIT(_sc, _name) mtx_init(&(_sc)->ifc_mtx, _name, "iflib ctx lock", MTX_DEF) #define CTX_LOCK(ctx) mtx_lock(&(ctx)->ifc_mtx) #define CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_mtx) #define CTX_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_mtx) #define TXDB_LOCK_INIT(txq) mtx_init(&(txq)->ift_db_mtx, (txq)->ift_db_mtx_name, NULL, MTX_DEF) #define TXDB_TRYLOCK(txq) mtx_trylock(&(txq)->ift_db_mtx) #define TXDB_LOCK(txq) mtx_lock(&(txq)->ift_db_mtx) #define TXDB_UNLOCK(txq) mtx_unlock(&(txq)->ift_db_mtx) #define TXDB_LOCK_DESTROY(txq) mtx_destroy(&(txq)->ift_db_mtx) #define CALLOUT_LOCK(txq) mtx_lock(&txq->ift_mtx) #define CALLOUT_UNLOCK(txq) mtx_unlock(&txq->ift_mtx) /* Our boot-time initialization hook */ static int iflib_module_event_handler(module_t, int, void *); static moduledata_t iflib_moduledata = { "iflib", iflib_module_event_handler, NULL }; DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(iflib, 1); MODULE_DEPEND(iflib, pci, 1, 1, 1); MODULE_DEPEND(iflib, ether, 1, 1, 1); TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1); TASKQGROUP_DEFINE(if_config_tqg, 1, 1); #ifndef IFLIB_DEBUG_COUNTERS #ifdef INVARIANTS #define IFLIB_DEBUG_COUNTERS 1 #else #define IFLIB_DEBUG_COUNTERS 0 #endif /* !INVARIANTS */ #endif static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0, "iflib driver parameters"); /* * XXX need to ensure that this can't accidentally cause the head to be moved backwards */ static int iflib_min_tx_latency = 0; SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW, &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput"); #if IFLIB_DEBUG_COUNTERS static int iflib_tx_seen; static int iflib_tx_sent; static int iflib_tx_encap; static int iflib_rx_allocs; static int iflib_fl_refills; static int iflib_fl_refills_large; static int iflib_tx_frees; SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD, &iflib_tx_seen, 0, "# tx mbufs seen"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD, &iflib_tx_sent, 0, "# tx mbufs sent"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD, &iflib_tx_encap, 0, "# tx mbufs encapped"); SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD, &iflib_tx_frees, 0, "# tx frees"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD, &iflib_rx_allocs, 0, "# rx allocations"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD, &iflib_fl_refills, 0, "# refills"); SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD, &iflib_fl_refills_large, 0, "# large refills"); static int iflib_txq_drain_flushing; static int iflib_txq_drain_oactive; static int iflib_txq_drain_notready; static int iflib_txq_drain_encapfail; SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD, &iflib_txq_drain_flushing, 0, "# drain flushes"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD, &iflib_txq_drain_oactive, 0, "# drain oactives"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD, &iflib_txq_drain_notready, 0, "# drain notready"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD, &iflib_txq_drain_encapfail, 0, "# drain encap fails"); static int iflib_encap_load_mbuf_fail; static int iflib_encap_txq_avail_fail; static int iflib_encap_txd_encap_fail; SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD, &iflib_encap_load_mbuf_fail, 0, "# busdma load failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD, &iflib_encap_txq_avail_fail, 0, "# txq avail failures"); SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD, &iflib_encap_txd_encap_fail, 0, "# driver encap failures"); static int iflib_task_fn_rxs; static int iflib_rx_intr_enables; static int iflib_fast_intrs; static int iflib_intr_link; static int iflib_intr_msix; static int iflib_rx_unavail; static int iflib_rx_ctx_inactive; static int iflib_rx_zero_len; static int iflib_rx_if_input; static int iflib_rx_mbuf_null; static int iflib_rxd_flush; static int iflib_verbose_debug; SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD, &iflib_intr_link, 0, "# intr link calls"); SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD, &iflib_intr_msix, 0, "# intr msix calls"); SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0, "# task_fn_rx calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD, &iflib_rx_intr_enables, 0, "# rx intr enables"); SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD, &iflib_fast_intrs, 0, "# fast_intr calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD, &iflib_rx_unavail, 0, "# times rxeof called with no available data"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD, &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD, &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input, 0, "# times rxeof called if_input"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD, &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, &iflib_rxd_flush, 0, "# times rxd_flush called"); SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW, &iflib_verbose_debug, 0, "enable verbose debugging"); #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1) static void iflib_debug_reset(void) { iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs = iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees = iflib_txq_drain_flushing = iflib_txq_drain_oactive = iflib_txq_drain_notready = iflib_txq_drain_encapfail = iflib_encap_load_mbuf_fail = iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail = iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs = iflib_intr_link = iflib_intr_msix = iflib_rx_unavail = iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input = iflib_rx_mbuf_null = iflib_rxd_flush = 0; } #else #define DBG_COUNTER_INC(name) static void iflib_debug_reset(void) {} #endif #define IFLIB_DEBUG 0 static void iflib_tx_structures_free(if_ctx_t ctx); static void iflib_rx_structures_free(if_ctx_t ctx); static int iflib_queues_alloc(if_ctx_t ctx); static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq); static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget); static int iflib_qset_structures_setup(if_ctx_t ctx); static int iflib_msix_init(if_ctx_t ctx); static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str); static void iflib_txq_check_drain(iflib_txq_t txq, int budget); static uint32_t iflib_txq_can_drain(struct ifmp_ring *); static int iflib_register(if_ctx_t); static void iflib_init_locked(if_ctx_t ctx); static void iflib_add_device_sysctl_pre(if_ctx_t ctx); static void iflib_add_device_sysctl_post(if_ctx_t ctx); static void iflib_ifmp_purge(iflib_txq_t txq); static void _iflib_pre_assert(if_softc_ctx_t scctx); #ifdef DEV_NETMAP #include #include #include MODULE_DEPEND(iflib, netmap, 1, 1, 1); /* * device-specific sysctl variables: * * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, * so using crcstrip=0 helps in benchmarks. * * iflib_rx_miss, iflib_rx_miss_bufs: * count packets that might be missed due to lost interrupts. */ SYSCTL_DECL(_dev_netmap); /* * The xl driver by default strips CRCs and we do not override it. */ int iflib_crcstrip = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip, CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames"); int iflib_rx_miss, iflib_rx_miss_bufs; SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss, CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs, CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs"); /* * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int iflib_netmap_register(struct netmap_adapter *na, int onoff) { struct ifnet *ifp = na->ifp; if_ctx_t ctx = ifp->if_softc; CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if (!CTX_IS_VF(ctx)) IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); /* enable or disable flags and callbacks in na and ifp */ if (onoff) { nm_set_native_flags(na); } else { nm_clear_native_flags(na); } IFDI_INIT(ctx); IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ? CTX_UNLOCK(ctx); return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. * * All information is in the kring. * Userspace wants to send packets up to the one before kring->rhead, * kernel knows kring->nr_hwcur is the first unsent packet. * * Here we push packets out (as many as possible), and possibly * reclaim buffers from previously completed transmission. * * The caller (netmap) guarantees that there is only one instance * running at any time. Any interference with other driver * methods should be handled by the individual drivers. */ static int iflib_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ u_int n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct if_pkt_info pi; /* * interrupts on every tx packet are expensive so request * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; /* device-specific */ if_ctx_t ctx = ifp->if_softc; iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id]; pi.ipi_segs = txq->ift_segs; pi.ipi_qsidx = kring->ring_id; pi.ipi_ndescs = 0; bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * First part: process new packets to send. * nm_i is the current index in the netmap ring, * nic_i is the corresponding index in the NIC ring. * * If we have packets to send (nm_i != head) * iterate over the netmap ring, fetch length and update * the corresponding slot in the NIC ring. Some drivers also * need to update the buffer's physical address in the NIC slot * even NS_BUF_CHANGED is not set (PNMB computes the addresses). * * The netmap_reload_map() calls is especially expensive, * even when (as in this case) the tag is 0, so do only * when the buffer has actually changed. * * If possible do not set the report/intr bit on all slots, * but only a few times per ring or when NS_REPORT is set. * * Finally, on 10G and faster drivers, it might be useful * to prefetch the next slot and txr entry. */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); __builtin_prefetch(&ring->slot[nm_i]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]); __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; void *addr = PNMB(na, slot, &paddr); int flags = (slot->flags & NS_REPORT || nic_i == 0 || nic_i == report_frequency) ? IPI_TX_INTR : 0; /* device-specific */ pi.ipi_pidx = nic_i; pi.ipi_flags = flags; /* Fill the slot in the NIC ring. */ ctx->isc_txd_encap(ctx->ifc_softc, &pi); /* prefetch for next round */ __builtin_prefetch(&ring->slot[nm_i + 1]); __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]); __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]); NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); /* make sure changes to the buffer are synced */ bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i], BUS_DMASYNC_PREWRITE); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* (re)start the tx unit up to slot nic_i (excluded) */ ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i); } /* * Second part: reclaim buffers for completed transmissions. */ if (iflib_tx_credits_update(ctx, txq)) { /* some tx completed, increment avail */ nic_i = txq->ift_cidx_processed; kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } return (0); } /* * Reconcile kernel and user view of the receive ring. * Same as for the txsync, this routine must be efficient. * The caller guarantees a single invocations, but races against * the rest of the driver should be handled here. * * On call, kring->rhead is the first packet that userspace wants * to keep, and kring->rcur is the wakeup point. * The kernel has previously reported packets up to kring->rtail. * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt. */ static int iflib_netmap_rxsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ u_int i, n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; struct if_rxd_info ri; /* device-specific */ if_ctx_t ctx = ifp->if_softc; iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id]; iflib_fl_t fl = rxq->ifr_fl; if (head > lim) return netmap_ring_reinit(kring); bzero(&ri, sizeof(ri)); ri.iri_qsidx = kring->ring_id; ri.iri_ifp = ctx->ifc_ifp; /* XXX check sync modes */ for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) bus_dmamap_sync(rxq->ifr_fl[i].ifl_desc_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * First part: import newly received packets. * * nm_i is the index of the next free slot in the netmap ring, * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * * nic_i = rxr->next_check; * nm_i = kring->nr_hwtail (previous) * and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_check is set to 0 on a ring reinit */ if (netmap_no_pendintr || force_update) { int crclen = iflib_crcstrip ? 0 : 4; int error, avail; uint16_t slot_flags = kring->nkr_slot_flags; for (fl = rxq->ifr_fl, i = 0; i < rxq->ifr_nfl; i++, fl++) { nic_i = fl->ifl_cidx; nm_i = netmap_idx_n2k(kring, nic_i); avail = ctx->isc_rxd_available(ctx->ifc_softc, kring->ring_id, nic_i, INT_MAX); for (n = 0; avail > 0; n++, avail--) { error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); if (error) ring->slot[nm_i].len = 0; else ring->slot[nm_i].len = ri.iri_len - crclen; ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, BUS_DMASYNC_POSTREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { /* diagnostics */ iflib_rx_miss ++; iflib_rx_miss_bufs += n; } fl->ifl_cidx = nic_i; kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } } /* * Second part: skip past packets that userspace has released. * (kring->nr_hwcur to head excluded), * and make the buffers available for reception. * As usual nm_i is the index in the netmap ring, * nic_i is the index in the NIC ring, and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ /* XXX not sure how this will work with multiple free lists */ nm_i = kring->nr_hwcur; if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; caddr_t vaddr; void *addr = PNMB(na, slot, &paddr); if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; vaddr = addr; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(na, fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, addr); slot->flags &= ~NS_BUF_CHANGED; } /* * XXX we should be batching this operation - TODO */ ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i, &paddr, &vaddr, 1, fl->ifl_buf_size); bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, BUS_DMASYNC_PREREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = head; bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ nic_i = nm_prev(nic_i, lim); ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i); } return 0; ring_reset: return netmap_ring_reinit(kring); } static int iflib_netmap_attach(if_ctx_t ctx) { struct netmap_adapter na; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; bzero(&na, sizeof(na)); na.ifp = ctx->ifc_ifp; na.na_flags = NAF_BDG_MAYSLEEP; MPASS(ctx->ifc_softc_ctx.isc_ntxqsets); MPASS(ctx->ifc_softc_ctx.isc_nrxqsets); na.num_tx_desc = scctx->isc_ntxd[0]; na.num_rx_desc = scctx->isc_nrxd[0]; na.nm_txsync = iflib_netmap_txsync; na.nm_rxsync = iflib_netmap_rxsync; na.nm_register = iflib_netmap_register; na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets; na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets; return (netmap_attach(&na)); } static void iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_slot *slot; slot = netmap_reset(na, NR_TX, txq->ift_id, 0); if (slot == 0) return; for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) { /* * In netmap mode, set the map for the packet buffer. * NOTE: Some drivers (not this one) also need to set * the physical buffer address in the NIC ring. * netmap_idx_n2k() maps a nic index, i, into the corresponding * netmap slot index, si */ int si = netmap_idx_n2k(&na->tx_rings[txq->ift_id], i); netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si)); } } static void iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); struct netmap_slot *slot; iflib_rxsd_t sd; int nrxd; slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0); if (slot == 0) return; sd = rxq->ifr_fl[0].ifl_sds; nrxd = ctx->ifc_softc_ctx.isc_nrxd[0]; for (int i = 0; i < nrxd; i++, sd++) { int sj = netmap_idx_n2k(&na->rx_rings[rxq->ifr_id], i); uint64_t paddr; void *addr; caddr_t vaddr; vaddr = addr = PNMB(na, slot + sj, &paddr); netmap_load_map(na, rxq->ifr_fl[0].ifl_ifdi->idi_tag, sd->ifsd_map, addr); /* Update descriptor and the cached value */ ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, i, &paddr, &vaddr, 1, rxq->ifr_fl[0].ifl_buf_size); } /* preserve queue */ if (ctx->ifc_ifp->if_capenable & IFCAP_NETMAP) { struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id]; int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, t); } else ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, nrxd-1); } #define iflib_netmap_detach(ifp) netmap_detach(ifp) #else #define iflib_netmap_txq_init(ctx, txq) #define iflib_netmap_rxq_init(ctx, rxq) #define iflib_netmap_detach(ifp) #define iflib_netmap_attach(ctx) (0) #define netmap_rx_irq(ifp, qid, budget) (0) #endif #if defined(__i386__) || defined(__amd64__) static __inline void prefetch(void *x) { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); } #else #define prefetch(x) #endif static void _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) { if (err) return; *(bus_addr_t *) arg = segs[0].ds_addr; } int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags) { int err; if_shared_ctx_t sctx = ctx->ifc_sctx; device_t dev = ctx->ifc_dev; KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized")); err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ sctx->isc_q_align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->idi_tag); if (err) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto fail_0; } err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map); if (err) { device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, err); goto fail_1; } dma->idi_paddr = IF_BAD_DMA; err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr, size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT); if (err || dma->idi_paddr == IF_BAD_DMA) { device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); goto fail_2; } dma->idi_size = size; return (0); fail_2: bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); fail_1: bus_dma_tag_destroy(dma->idi_tag); fail_0: dma->idi_tag = NULL; return (err); } int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count) { int i, err; iflib_dma_info_t *dmaiter; dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) { if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0) break; } if (err) iflib_dma_free_multi(dmalist, i); return (err); } void iflib_dma_free(iflib_dma_info_t dma) { if (dma->idi_tag == NULL) return; if (dma->idi_paddr != IF_BAD_DMA) { bus_dmamap_sync(dma->idi_tag, dma->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->idi_tag, dma->idi_map); dma->idi_paddr = IF_BAD_DMA; } if (dma->idi_vaddr != NULL) { bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map); dma->idi_vaddr = NULL; } bus_dma_tag_destroy(dma->idi_tag); dma->idi_tag = NULL; } void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count) { int i; iflib_dma_info_t *dmaiter = dmalist; for (i = 0; i < count; i++, dmaiter++) iflib_dma_free(*dmaiter); } static int iflib_fast_intr(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; - if (!smp_started) + if (!smp_started && mp_ncpus > 1) return (FILTER_HANDLED); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) return (FILTER_HANDLED); GROUPTASK_ENQUEUE(gtask); return (FILTER_HANDLED); } static int _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, driver_intr_t handler, void *arg, char *name) { int rc; struct resource *res; void *tag; device_t dev = ctx->ifc_dev; MPASS(rid < 512); irq->ii_rid = rid; res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, RF_SHAREABLE | RF_ACTIVE); if (res == NULL) { device_printf(dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } irq->ii_res = res; KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL")); rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET, filter, handler, arg, &tag); if (rc != 0) { device_printf(dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name ? name : "unknown", rc); return (rc); } else if (name) bus_describe_intr(dev, res, tag, "%s", name); irq->ii_tag = tag; return (0); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int iflib_txsd_alloc(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; int err, nsegments, ntsosegments; nsegments = scctx->isc_tx_nsegments; ntsosegments = scctx->isc_tx_tso_segments_max; MPASS(scctx->isc_ntxd[0] > 0); MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0); MPASS(nsegments > 0); MPASS(ntsosegments > 0); /* * Setup DMA descriptor areas. */ if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_tx_maxsize, /* maxsize */ nsegments, /* nsegments */ sctx->isc_tx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_desc_tag))) { device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err); device_printf(dev,"maxsize: %zd nsegments: %d maxsegsize: %zd\n", sctx->isc_tx_maxsize, nsegments, sctx->isc_tx_maxsegsize); goto fail; } if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ scctx->isc_tx_tso_size_max, /* maxsize */ ntsosegments, /* nsegments */ scctx->isc_tx_tso_segsize_max, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txq->ift_tso_desc_tag))) { device_printf(dev,"Unable to allocate TX TSO DMA tag: %d\n", err); goto fail; } if (!(txq->ift_sds.ifsd_flags = (uint8_t *) malloc(sizeof(uint8_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } if (!(txq->ift_sds.ifsd_m = (struct mbuf **) malloc(sizeof(struct mbuf *) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ #if defined(ACPI_DMAR) || (!(defined(__i386__) && !defined(__amd64__))) if ((ctx->ifc_flags & IFC_DMAR) == 0) return (0); if (!(txq->ift_sds.ifsd_map = (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer map memory\n"); err = ENOMEM; goto fail; } for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) { err = bus_dmamap_create(txq->ift_desc_tag, 0, &txq->ift_sds.ifsd_map[i]); if (err != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } #endif return (0); fail: /* We free all, it handles case where we are in the middle */ iflib_tx_structures_free(ctx); return (err); } static void iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i) { bus_dmamap_t map; map = NULL; if (txq->ift_sds.ifsd_map != NULL) map = txq->ift_sds.ifsd_map[i]; if (map != NULL) { bus_dmamap_unload(txq->ift_desc_tag, map); bus_dmamap_destroy(txq->ift_desc_tag, map); txq->ift_sds.ifsd_map[i] = NULL; } } static void iflib_txq_destroy(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; for (int i = 0; i < txq->ift_size; i++) iflib_txsd_destroy(ctx, txq, i); if (txq->ift_sds.ifsd_map != NULL) { free(txq->ift_sds.ifsd_map, M_IFLIB); txq->ift_sds.ifsd_map = NULL; } if (txq->ift_sds.ifsd_m != NULL) { free(txq->ift_sds.ifsd_m, M_IFLIB); txq->ift_sds.ifsd_m = NULL; } if (txq->ift_sds.ifsd_flags != NULL) { free(txq->ift_sds.ifsd_flags, M_IFLIB); txq->ift_sds.ifsd_flags = NULL; } if (txq->ift_desc_tag != NULL) { bus_dma_tag_destroy(txq->ift_desc_tag); txq->ift_desc_tag = NULL; } if (txq->ift_tso_desc_tag != NULL) { bus_dma_tag_destroy(txq->ift_tso_desc_tag); txq->ift_tso_desc_tag = NULL; } } static void iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i) { struct mbuf **mp; mp = &txq->ift_sds.ifsd_m[i]; if (*mp == NULL) return; if (txq->ift_sds.ifsd_map != NULL) { bus_dmamap_sync(txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txq->ift_desc_tag, txq->ift_sds.ifsd_map[i]); } m_free(*mp); DBG_COUNTER_INC(tx_frees); *mp = NULL; } static int iflib_txq_setup(iflib_txq_t txq) { if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; iflib_dma_info_t di; int i; /* Set number of descriptors available */ txq->ift_qstatus = IFLIB_QUEUE_IDLE; /* Reset indices */ txq->ift_cidx_processed = txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0; txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset]; for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++) bzero((void *)di->idi_vaddr, di->idi_size); IFDI_TXQ_SETUP(ctx, txq->ift_id); for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++) bus_dmamap_sync(di->idi_tag, di->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int iflib_rxsd_alloc(iflib_rxq_t rxq) { if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; iflib_fl_t fl; iflib_rxsd_t rxsd; int err; MPASS(scctx->isc_nrxd[0] > 0); MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0); fl = rxq->ifr_fl; for (int i = 0; i < rxq->ifr_nfl; i++, fl++) { fl->ifl_sds = malloc(sizeof(struct iflib_sw_rx_desc) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_WAITOK | M_ZERO); if (fl->ifl_sds == NULL) { device_printf(dev, "Unable to allocate rx sw desc memory\n"); return (ENOMEM); } fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sctx->isc_rx_maxsize, /* maxsize */ sctx->isc_rx_nsegments, /* nsegments */ sctx->isc_rx_maxsegsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &fl->ifl_desc_tag); if (err) { device_printf(dev, "%s: bus_dma_tag_create failed %d\n", __func__, err); goto fail; } rxsd = fl->ifl_sds; for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++, rxsd++) { err = bus_dmamap_create(fl->ifl_desc_tag, 0, &rxsd->ifsd_map); if (err) { device_printf(dev, "%s: bus_dmamap_create failed: %d\n", __func__, err); goto fail; } } } return (0); fail: iflib_rx_structures_free(ctx); return (err); } /* * Internal service routines */ struct rxq_refill_cb_arg { int error; bus_dma_segment_t seg; int nseg; }; static void _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct rxq_refill_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } #ifdef ACPI_DMAR #define IS_DMAR(ctx) (ctx->ifc_flags & IFC_DMAR) #else #define IS_DMAR(ctx) (0) #endif /** * rxq_refill - refill an rxq free-buffer list * @ctx: the iflib context * @rxq: the free-list to refill * @n: the number of new buffers to allocate * * (Re)populate an rxq free-buffer list with up to @n new packet buffers. * The caller must assure that @n does not exceed the queue's capacity. */ static void _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count) { struct mbuf *m; int pidx = fl->ifl_pidx; iflib_rxsd_t rxsd = &fl->ifl_sds[pidx]; caddr_t cl; int n, i = 0; uint64_t bus_addr; int err; n = count; MPASS(n > 0); MPASS(fl->ifl_credits + n <= fl->ifl_size); if (pidx < fl->ifl_cidx) MPASS(pidx + n <= fl->ifl_cidx); if (pidx == fl->ifl_cidx && (fl->ifl_credits < fl->ifl_size)) MPASS(fl->ifl_gen == 0); if (pidx > fl->ifl_cidx) MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx); DBG_COUNTER_INC(fl_refills); if (n > 8) DBG_COUNTER_INC(fl_refills_large); while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. * * If the cluster is still set then we know a minimum sized packet was received */ if ((cl = rxsd->ifsd_cl) == NULL) { if ((cl = rxsd->ifsd_cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL) break; #if MEMORY_LOGGING fl->ifl_cl_enqueued++; #endif } if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { break; } #if MEMORY_LOGGING fl->ifl_m_enqueued++; #endif DBG_COUNTER_INC(rx_allocs); #ifdef notyet if ((rxsd->ifsd_flags & RX_SW_DESC_MAP_CREATED) == 0) { int err; if ((err = bus_dmamap_create(fl->ifl_ifdi->idi_tag, 0, &rxsd->ifsd_map))) { log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); uma_zfree(fl->ifl_zone, cl); n = 0; goto done; } rxsd->ifsd_flags |= RX_SW_DESC_MAP_CREATED; } #endif #if defined(__i386__) || defined(__amd64__) if (!IS_DMAR(ctx)) { bus_addr = pmap_kextract((vm_offset_t)cl); } else #endif { struct rxq_refill_cb_arg cb_arg; iflib_rxq_t q; cb_arg.error = 0; q = fl->ifl_rxq; err = bus_dmamap_load(fl->ifl_desc_tag, rxsd->ifsd_map, cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0); if (err != 0 || cb_arg.error) { /* * !zone_pack ? */ if (fl->ifl_zone == zone_pack) uma_zfree(fl->ifl_zone, cl); m_free(m); n = 0; goto done; } bus_addr = cb_arg.seg.ds_addr; } rxsd->ifsd_flags |= RX_SW_DESC_INUSE; MPASS(rxsd->ifsd_m == NULL); rxsd->ifsd_cl = cl; rxsd->ifsd_m = m; fl->ifl_bus_addrs[i] = bus_addr; fl->ifl_vm_addrs[i] = cl; rxsd++; fl->ifl_credits++; i++; MPASS(fl->ifl_credits <= fl->ifl_size); if (++fl->ifl_pidx == fl->ifl_size) { fl->ifl_pidx = 0; fl->ifl_gen = 1; rxsd = fl->ifl_sds; } if (n == 0 || i == IFLIB_MAX_RX_REFRESH) { ctx->isc_rxd_refill(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx, fl->ifl_bus_addrs, fl->ifl_vm_addrs, i, fl->ifl_buf_size); i = 0; pidx = fl->ifl_pidx; } } done: DBG_COUNTER_INC(rxd_flush); if (fl->ifl_pidx == 0) pidx = fl->ifl_size - 1; else pidx = fl->ifl_pidx - 1; ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx); } static __inline void __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max) { /* we avoid allowing pidx to catch up with cidx as it confuses ixl */ int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1; #ifdef INVARIANTS int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1; #endif MPASS(fl->ifl_credits <= fl->ifl_size); MPASS(reclaimable == delta); if (reclaimable > 0) _iflib_fl_refill(ctx, fl, min(max, reclaimable)); } static void iflib_fl_bufs_free(iflib_fl_t fl) { iflib_dma_info_t idi = fl->ifl_ifdi; uint32_t i; for (i = 0; i < fl->ifl_size; i++) { iflib_rxsd_t d = &fl->ifl_sds[i]; if (d->ifsd_flags & RX_SW_DESC_INUSE) { bus_dmamap_unload(fl->ifl_desc_tag, d->ifsd_map); bus_dmamap_destroy(fl->ifl_desc_tag, d->ifsd_map); if (d->ifsd_m != NULL) { m_init(d->ifsd_m, M_NOWAIT, MT_DATA, 0); uma_zfree(zone_mbuf, d->ifsd_m); } if (d->ifsd_cl != NULL) uma_zfree(fl->ifl_zone, d->ifsd_cl); d->ifsd_flags = 0; } else { MPASS(d->ifsd_cl == NULL); MPASS(d->ifsd_m == NULL); } #if MEMORY_LOGGING fl->ifl_m_dequeued++; fl->ifl_cl_dequeued++; #endif d->ifsd_cl = NULL; d->ifsd_m = NULL; } /* * Reset free list values */ fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = 0;; bzero(idi->idi_vaddr, idi->idi_size); } /********************************************************************* * * Initialize a receive ring and its buffers. * **********************************************************************/ static int iflib_fl_setup(iflib_fl_t fl) { iflib_rxq_t rxq = fl->ifl_rxq; if_ctx_t ctx = rxq->ifr_ctx; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; /* ** Free current RX buffer structs and their mbufs */ iflib_fl_bufs_free(fl); /* Now replenish the mbufs */ MPASS(fl->ifl_credits == 0); /* * XXX don't set the max_frame_size to larger * than the hardware can handle */ if (sctx->isc_max_frame_size <= 2048) fl->ifl_buf_size = MCLBYTES; else if (sctx->isc_max_frame_size <= 4096) fl->ifl_buf_size = MJUMPAGESIZE; else if (sctx->isc_max_frame_size <= 9216) fl->ifl_buf_size = MJUM9BYTES; else fl->ifl_buf_size = MJUM16BYTES; if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size) ctx->ifc_max_fl_buf_size = fl->ifl_buf_size; fl->ifl_cltype = m_gettype(fl->ifl_buf_size); fl->ifl_zone = m_getzone(fl->ifl_buf_size); /* avoid pre-allocating zillions of clusters to an idle card * potentially speeding up attach */ _iflib_fl_refill(ctx, fl, min(128, fl->ifl_size)); MPASS(min(128, fl->ifl_size) == fl->ifl_credits); if (min(128, fl->ifl_size) != fl->ifl_credits) return (ENOBUFS); /* * handle failure */ MPASS(rxq != NULL); MPASS(fl->ifl_ifdi != NULL); bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); return (0); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void iflib_rx_sds_free(iflib_rxq_t rxq) { iflib_fl_t fl; int i; if (rxq->ifr_fl != NULL) { for (i = 0; i < rxq->ifr_nfl; i++) { fl = &rxq->ifr_fl[i]; if (fl->ifl_desc_tag != NULL) { bus_dma_tag_destroy(fl->ifl_desc_tag); fl->ifl_desc_tag = NULL; } } if (rxq->ifr_fl->ifl_sds != NULL) free(rxq->ifr_fl->ifl_sds, M_IFLIB); free(rxq->ifr_fl, M_IFLIB); rxq->ifr_fl = NULL; rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0; } } /* * MI independent logic * */ static void iflib_timer(void *arg) { iflib_txq_t txq = arg; if_ctx_t ctx = txq->ift_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; /* ** Check on the state of the TX queue(s), this ** can be done without the lock because its RO ** and the HUNG state will be static if set. */ IFDI_TIMER(ctx, txq->ift_id); if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) && (ctx->ifc_pause_frames == 0)) goto hung; if (TXQ_AVAIL(txq) <= 2*scctx->isc_tx_nsegments || ifmp_ring_is_stalled(txq->ift_br[0])) GROUPTASK_ENQUEUE(&txq->ift_task); ctx->ifc_pause_frames = 0; if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); return; hung: CTX_LOCK(ctx); if_setdrvflagbits(ctx->ifc_ifp, 0, IFF_DRV_RUNNING); device_printf(ctx->ifc_dev, "TX(%d) desc avail = %d, pidx = %d\n", txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx); IFDI_WATCHDOG_RESET(ctx); ctx->ifc_watchdog_events++; ctx->ifc_pause_frames = 0; iflib_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_init_locked(if_ctx_t ctx) { if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_t ifp = ctx->ifc_ifp; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j, tx_ip_csum_flags, tx_ip6_csum_flags; if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP); /* Set hardware offload abilities */ if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, tx_ip_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, tx_ip6_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); iflib_netmap_txq_init(ctx, txq); } for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { iflib_netmap_rxq_init(ctx, rxq); } #ifdef INVARIANTS i = if_getdrvflags(ifp); #endif IFDI_INIT(ctx); MPASS(if_getdrvflags(ifp) == i); for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { if (iflib_fl_setup(fl)) { device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n"); goto done; } } } done: if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); IFDI_INTR_ENABLE(ctx); txq = ctx->ifc_txqs; for (i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); } static int iflib_media_change(if_t ifp) { if_ctx_t ctx = if_getsoftc(ifp); int err; CTX_LOCK(ctx); if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0) iflib_init_locked(ctx); CTX_UNLOCK(ctx); return (err); } static void iflib_media_status(if_t ifp, struct ifmediareq *ifmr) { if_ctx_t ctx = if_getsoftc(ifp); CTX_LOCK(ctx); IFDI_UPDATE_ADMIN_STATUS(ctx); IFDI_MEDIA_STATUS(ctx, ifmr); CTX_UNLOCK(ctx); } static void iflib_stop(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; iflib_dma_info_t di; iflib_fl_t fl; int i, j; /* Tell the stack that the interface is no longer active */ if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); DELAY(100000); IFDI_STOP(ctx); DELAY(100000); iflib_debug_reset(); /* Wait for current tx queue users to exit to disarm watchdog timer. */ for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) { /* make sure all transmitters have completed before proceeding XXX */ /* clean any enqueued buffers */ iflib_ifmp_purge(txq); /* Free any existing tx buffers. */ for (j = 0; j < txq->ift_size; j++) { iflib_txsd_free(ctx, txq, j); } txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0; txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0; txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0; txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0; txq->ift_pullups = 0; ifmp_ring_reset_stats(txq->ift_br[0]); for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); } for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) { /* make sure all transmitters have completed before proceeding XXX */ for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwrxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); /* also resets the free lists pidx/cidx */ for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) iflib_fl_bufs_free(fl); } } static iflib_rxsd_t rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int *cltype, int unload) { int flid, cidx; iflib_rxsd_t sd; iflib_fl_t fl; iflib_dma_info_t di; flid = irf->irf_flid; cidx = irf->irf_idx; fl = &rxq->ifr_fl[flid]; fl->ifl_credits--; #if MEMORY_LOGGING fl->ifl_m_dequeued++; if (cltype) fl->ifl_cl_dequeued++; #endif sd = &fl->ifl_sds[cidx]; di = fl->ifl_ifdi; bus_dmamap_sync(di->idi_tag, di->idi_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* not valid assert if bxe really does SGE from non-contiguous elements */ MPASS(fl->ifl_cidx == cidx); if (unload) bus_dmamap_unload(fl->ifl_desc_tag, sd->ifsd_map); if (__predict_false(++fl->ifl_cidx == fl->ifl_size)) { fl->ifl_cidx = 0; fl->ifl_gen = 0; } /* YES ick */ if (cltype) *cltype = fl->ifl_cltype; return (sd); } static struct mbuf * assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri) { int i, padlen , flags, cltype; struct mbuf *m, *mh, *mt; iflib_rxsd_t sd; caddr_t cl; i = 0; mh = NULL; do { sd = rxd_frag_to_sd(rxq, &ri->iri_frags[i], &cltype, TRUE); MPASS(sd->ifsd_cl != NULL); MPASS(sd->ifsd_m != NULL); /* Don't include zero-length frags */ if (ri->iri_frags[i].irf_len == 0) { /* XXX we can save the cluster here, but not the mbuf */ m_init(sd->ifsd_m, M_NOWAIT, MT_DATA, 0); m_free(sd->ifsd_m); sd->ifsd_m = NULL; continue; } m = sd->ifsd_m; if (mh == NULL) { flags = M_PKTHDR|M_EXT; mh = mt = m; padlen = ri->iri_pad; } else { flags = M_EXT; mt->m_next = m; mt = m; /* assuming padding is only on the first fragment */ padlen = 0; } sd->ifsd_m = NULL; cl = sd->ifsd_cl; sd->ifsd_cl = NULL; /* Can these two be made one ? */ m_init(m, M_NOWAIT, MT_DATA, flags); m_cljset(m, cl, cltype); /* * These must follow m_init and m_cljset */ m->m_data += padlen; ri->iri_len -= padlen; m->m_len = ri->iri_frags[i].irf_len; } while (++i < ri->iri_nfrags); return (mh); } /* * Process one software descriptor */ static struct mbuf * iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri) { struct mbuf *m; iflib_rxsd_t sd; /* should I merge this back in now that the two paths are basically duplicated? */ if (ri->iri_nfrags == 1 && ri->iri_frags[0].irf_len <= IFLIB_RX_COPY_THRESH) { sd = rxd_frag_to_sd(rxq, &ri->iri_frags[0], NULL, FALSE); m = sd->ifsd_m; sd->ifsd_m = NULL; m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR); memcpy(m->m_data, sd->ifsd_cl, ri->iri_len); m->m_len = ri->iri_frags[0].irf_len; } else { m = assemble_segments(rxq, ri); } m->m_pkthdr.len = ri->iri_len; m->m_pkthdr.rcvif = ri->iri_ifp; m->m_flags |= ri->iri_flags; m->m_pkthdr.ether_vtag = ri->iri_vtag; m->m_pkthdr.flowid = ri->iri_flowid; M_HASHTYPE_SET(m, ri->iri_rsstype); m->m_pkthdr.csum_flags = ri->iri_csum_flags; m->m_pkthdr.csum_data = ri->iri_csum_data; return (m); } static bool iflib_rxeof(iflib_rxq_t rxq, int budget) { if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int avail, i; uint16_t *cidxp; struct if_rxd_info ri; int err, budget_left, rx_bytes, rx_pkts; iflib_fl_t fl; struct ifnet *ifp; int lro_enabled; /* * XXX early demux data packets so that if_input processing only handles * acks in interrupt context */ struct mbuf *m, *mh, *mt; if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &budget)) { return (FALSE); } mh = mt = NULL; MPASS(budget > 0); rx_pkts = rx_bytes = 0; if (sctx->isc_flags & IFLIB_HAS_RXCQ) cidxp = &rxq->ifr_cq_cidx; else cidxp = &rxq->ifr_fl[0].ifl_cidx; if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) { for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) __iflib_fl_refill_lt(ctx, fl, budget + 8); DBG_COUNTER_INC(rx_unavail); return (false); } for (budget_left = budget; (budget_left > 0) && (avail > 0); budget_left--, avail--) { if (__predict_false(!CTX_ACTIVE(ctx))) { DBG_COUNTER_INC(rx_ctx_inactive); break; } /* * Reset client set fields to their default values */ bzero(&ri, sizeof(ri)); ri.iri_qsidx = rxq->ifr_id; ri.iri_cidx = *cidxp; ri.iri_ifp = ctx->ifc_ifp; ri.iri_frags = rxq->ifr_frags; err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); /* in lieu of handling correctly - make sure it isn't being unhandled */ MPASS(err == 0); if (sctx->isc_flags & IFLIB_HAS_RXCQ) { *cidxp = ri.iri_cidx; /* Update our consumer index */ while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) { rxq->ifr_cq_cidx -= scctx->isc_nrxd[0]; rxq->ifr_cq_gen = 0; } /* was this only a completion queue message? */ if (__predict_false(ri.iri_nfrags == 0)) continue; } MPASS(ri.iri_nfrags != 0); MPASS(ri.iri_len != 0); /* will advance the cidx on the corresponding free lists */ m = iflib_rxd_pkt_get(rxq, &ri); if (avail == 0 && budget_left) avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left); if (__predict_false(m == NULL)) { DBG_COUNTER_INC(rx_mbuf_null); continue; } /* imm_pkt: -- cxgb */ if (mh == NULL) mh = mt = m; else { mt->m_nextpkt = m; mt = m; } } /* make sure that we can refill faster than drain */ for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) __iflib_fl_refill_lt(ctx, fl, budget + 8); ifp = ctx->ifc_ifp; lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO); while (mh != NULL) { m = mh; mh = mh->m_nextpkt; m->m_nextpkt = NULL; rx_bytes += m->m_pkthdr.len; rx_pkts++; #if defined(INET6) || defined(INET) if (lro_enabled && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0) continue; #endif DBG_COUNTER_INC(rx_if_input); ifp->if_input(ifp, m); } if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes); if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts); /* * Flush any outstanding LRO work */ #if defined(INET6) || defined(INET) tcp_lro_flush_all(&rxq->ifr_lc); #endif if (avail) return true; return (iflib_rxd_avail(ctx, rxq, *cidxp, 1)); } #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags) #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG) #define TXQ_MAX_DB_DEFERRED(size) (size >> 5) #define TXQ_MAX_DB_CONSUMED(size) (size >> 4) static __inline void iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring) { uint32_t dbval; if (ring || txq->ift_db_pending >= TXQ_MAX_DB_DEFERRED(txq->ift_size)) { /* the lock will only ever be contended in the !min_latency case */ if (!TXDB_TRYLOCK(txq)) return; dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx; ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval); txq->ift_db_pending = txq->ift_npending = 0; TXDB_UNLOCK(txq); } } static void iflib_txd_deferred_db_check(void * arg) { iflib_txq_t txq = arg; /* simple non-zero boolean so use bitwise OR */ if ((txq->ift_db_pending | txq->ift_npending) && txq->ift_db_pending >= txq->ift_db_pending_queued) iflib_txd_db_check(txq->ift_ctx, txq, TRUE); txq->ift_db_pending_queued = 0; if (ifmp_ring_is_stalled(txq->ift_br[0])) iflib_txq_check_drain(txq, 4); } #ifdef PKT_DEBUG static void print_pkt(if_pkt_info_t pi) { printf("pi len: %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n", pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx); printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n", pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag); printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n", pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto); } #endif #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO) #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO) static int iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp) { if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx; struct ether_vlan_header *eh; struct mbuf *m, *n; n = m = *mp; if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) && M_WRITABLE(m) == 0) { if ((m = m_dup(m, M_NOWAIT)) == NULL) { return (ENOMEM); } else { m_freem(*mp); n = *mp = m; } } /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ if (__predict_false(m->m_len < sizeof(*eh))) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL)) return (ENOMEM); } eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { pi->ipi_etype = ntohs(eh->evl_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { pi->ipi_etype = ntohs(eh->evl_encap_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN; } switch (pi->ipi_etype) { #ifdef INET case ETHERTYPE_IP: { struct ip *ip = NULL; struct tcphdr *th = NULL; int minthlen; minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th)); if (__predict_false(m->m_len < minthlen)) { /* * if this code bloat is causing too much of a hit * move it to a separate function and mark it noinline */ if (m->m_len == pi->ipi_ehdrlen) { n = m->m_next; MPASS(n); if (n->m_len >= sizeof(*ip)) { ip = (struct ip *)n->m_data; if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); } } else { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) return (ENOMEM); ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } } else { ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th)) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); } pi->ipi_ip_hlen = ip->ip_hl << 2; pi->ipi_ipproto = ip->ip_p; pi->ipi_flags |= IPI_TX_IPV4; if (pi->ipi_csum_flags & CSUM_IP) ip->ip_sum = 0; if (pi->ipi_ipproto == IPPROTO_TCP) { if (__predict_false(th == NULL)) { txq->ift_pullups++; if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL)) return (ENOMEM); th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; pi->ipi_tcp_seq = th->th_seq; } if (IS_TSO4(pi)) { if (__predict_false(ip->ip_p != IPPROTO_TCP)) return (ENXIO); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; if (sctx->isc_flags & IFLIB_TSO_INIT_IP) { ip->ip_sum = 0; ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz); } } break; } #endif #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen); struct tcphdr *th; pi->ipi_ip_hlen = sizeof(struct ip6_hdr); if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) { if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL)) return (ENOMEM); } th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen); /* XXX-BZ this will go badly in case of ext hdrs. */ pi->ipi_ipproto = ip6->ip6_nxt; pi->ipi_flags |= IPI_TX_IPV6; if (pi->ipi_ipproto == IPPROTO_TCP) { if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) { if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL)) return (ENOMEM); } pi->ipi_tcp_hflags = th->th_flags; pi->ipi_tcp_hlen = th->th_off << 2; } if (IS_TSO6(pi)) { if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP)) return (ENXIO); /* * The corresponding flag is set by the stack in the IPv4 * TSO case, but not in IPv6 (at least in FreeBSD 10.2). * So, set it here because the rest of the flow requires it. */ pi->ipi_csum_flags |= CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; } break; } #endif default: pi->ipi_csum_flags &= ~CSUM_OFFLOAD; pi->ipi_ip_hlen = 0; break; } *mp = m; return (0); } static __noinline struct mbuf * collapse_pkthdr(struct mbuf *m0) { struct mbuf *m, *m_next, *tmp; m = m0; m_next = m->m_next; while (m_next != NULL && m_next->m_len == 0) { m = m_next; m->m_next = NULL; m_free(m); m_next = m_next->m_next; } m = m0; m->m_next = m_next; if ((m_next->m_flags & M_EXT) == 0) { m = m_defrag(m, M_NOWAIT); } else { tmp = m_next->m_next; memcpy(m_next, m, MPKTHSIZE); m = m_next; m->m_next = tmp; } return (m); } /* * If dodgy hardware rejects the scatter gather chain we've handed it * we'll need to remove the mbuf chain from ifsg_m[] before we can add the * m_defrag'd mbufs */ static __noinline struct mbuf * iflib_remove_mbuf(iflib_txq_t txq) { int ntxd, i, pidx; struct mbuf *m, *mh, **ifsd_m; pidx = txq->ift_pidx; ifsd_m = txq->ift_sds.ifsd_m; ntxd = txq->ift_size; mh = m = ifsd_m[pidx]; ifsd_m[pidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif i = 1; while (m) { ifsd_m[(pidx + i) & (ntxd -1)] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif m = m->m_next; i++; } return (mh); } static int iflib_busdma_load_mbuf_sg(iflib_txq_t txq, bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf **m0, bus_dma_segment_t *segs, int *nsegs, int max_segs, int flags) { if_ctx_t ctx; if_shared_ctx_t sctx; if_softc_ctx_t scctx; int i, next, pidx, mask, err, maxsegsz, ntxd, count; struct mbuf *m, *tmp, **ifsd_m, **mp; m = *m0; /* * Please don't ever do this */ if (__predict_false(m->m_len == 0)) *m0 = m = collapse_pkthdr(m); ctx = txq->ift_ctx; sctx = ctx->ifc_sctx; scctx = &ctx->ifc_softc_ctx; ifsd_m = txq->ift_sds.ifsd_m; ntxd = txq->ift_size; pidx = txq->ift_pidx; if (map != NULL) { uint8_t *ifsd_flags = txq->ift_sds.ifsd_flags; err = bus_dmamap_load_mbuf_sg(tag, map, *m0, segs, nsegs, BUS_DMA_NOWAIT); if (err) return (err); ifsd_flags[pidx] |= TX_SW_DESC_MAPPED; i = 0; next = pidx; mask = (txq->ift_size-1); m = *m0; do { mp = &ifsd_m[next]; *mp = m; m = m->m_next; if (__predict_false((*mp)->m_len == 0)) { m_free(*mp); *mp = NULL; } else next = (pidx + i) & (ntxd-1); } while (m != NULL); } else { int buflen, sgsize, max_sgsize; vm_offset_t vaddr; vm_paddr_t curaddr; count = i = 0; maxsegsz = sctx->isc_tx_maxsize; m = *m0; do { if (__predict_false(m->m_len <= 0)) { tmp = m; m = m->m_next; tmp->m_next = NULL; m_free(tmp); continue; } buflen = m->m_len; vaddr = (vm_offset_t)m->m_data; /* * see if we can't be smarter about physically * contiguous mappings */ next = (pidx + count) & (ntxd-1); MPASS(ifsd_m[next] == NULL); #if MEMORY_LOGGING txq->ift_enqueued++; #endif ifsd_m[next] = m; while (buflen > 0) { max_sgsize = MIN(buflen, maxsegsz); curaddr = pmap_kextract(vaddr); sgsize = PAGE_SIZE - (curaddr & PAGE_MASK); sgsize = MIN(sgsize, max_sgsize); segs[i].ds_addr = curaddr; segs[i].ds_len = sgsize; vaddr += sgsize; buflen -= sgsize; i++; if (i >= max_segs) goto err; } count++; tmp = m; m = m->m_next; } while (m != NULL); *nsegs = i; } return (0); err: *m0 = iflib_remove_mbuf(txq); return (EFBIG); } static int iflib_encap(iflib_txq_t txq, struct mbuf **m_headp) { if_ctx_t ctx; if_shared_ctx_t sctx; if_softc_ctx_t scctx; bus_dma_segment_t *segs; struct mbuf *m_head; bus_dmamap_t map; struct if_pkt_info pi; int remap = 0; int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd; bus_dma_tag_t desc_tag; segs = txq->ift_segs; ctx = txq->ift_ctx; sctx = ctx->ifc_sctx; scctx = &ctx->ifc_softc_ctx; segs = txq->ift_segs; ntxd = txq->ift_size; m_head = *m_headp; map = NULL; /* * If we're doing TSO the next descriptor to clean may be quite far ahead */ cidx = txq->ift_cidx; pidx = txq->ift_pidx; next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1); /* prefetch the next cache line of mbuf pointers and flags */ prefetch(&txq->ift_sds.ifsd_m[next]); if (txq->ift_sds.ifsd_map != NULL) { prefetch(&txq->ift_sds.ifsd_map[next]); map = txq->ift_sds.ifsd_map[pidx]; next = (cidx + CACHE_LINE_SIZE) & (ntxd-1); prefetch(&txq->ift_sds.ifsd_flags[next]); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { desc_tag = txq->ift_tso_desc_tag; max_segs = scctx->isc_tx_tso_segments_max; } else { desc_tag = txq->ift_desc_tag; max_segs = scctx->isc_tx_nsegments; } m_head = *m_headp; bzero(&pi, sizeof(pi)); pi.ipi_len = m_head->m_pkthdr.len; pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST)); pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags; pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0; pi.ipi_pidx = pidx; pi.ipi_qsidx = txq->ift_id; /* deliberate bitwise OR to make one condition */ if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) { if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) return (err); m_head = *m_headp; } retry: err = iflib_busdma_load_mbuf_sg(txq, desc_tag, map, m_headp, segs, &nsegs, max_segs, BUS_DMA_NOWAIT); defrag: if (__predict_false(err)) { switch (err) { case EFBIG: /* try collapse once and defrag once */ if (remap == 0) m_head = m_collapse(*m_headp, M_NOWAIT, max_segs); if (remap == 1) m_head = m_defrag(*m_headp, M_NOWAIT); remap++; if (__predict_false(m_head == NULL)) goto defrag_failed; txq->ift_mbuf_defrag++; *m_headp = m_head; goto retry; break; case ENOMEM: txq->ift_no_tx_dma_setup++; break; default: txq->ift_no_tx_dma_setup++; m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; break; } txq->ift_map_failed++; DBG_COUNTER_INC(encap_load_mbuf_fail); return (err); } /* * XXX assumes a 1 to 1 relationship between segments and * descriptors - this does not hold true on all drivers, e.g. * cxgb */ if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) { txq->ift_no_desc_avail++; if (map != NULL) bus_dmamap_unload(desc_tag, map); DBG_COUNTER_INC(encap_txq_avail_fail); if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0) GROUPTASK_ENQUEUE(&txq->ift_task); return (ENOBUFS); } pi.ipi_segs = segs; pi.ipi_nsegs = nsegs; MPASS(pidx >= 0 && pidx < txq->ift_size); #ifdef PKT_DEBUG print_pkt(&pi); #endif if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) { bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); DBG_COUNTER_INC(tx_encap); MPASS(pi.ipi_new_pidx >= 0 && pi.ipi_new_pidx < txq->ift_size); ndesc = pi.ipi_new_pidx - pi.ipi_pidx; if (pi.ipi_new_pidx < pi.ipi_pidx) { ndesc += txq->ift_size; txq->ift_gen = 1; } /* * drivers can need as many as * two sentinels */ MPASS(ndesc <= pi.ipi_nsegs + 2); MPASS(pi.ipi_new_pidx != pidx); MPASS(ndesc > 0); txq->ift_in_use += ndesc; /* * We update the last software descriptor again here because there may * be a sentinel and/or there may be more mbufs than segments */ txq->ift_pidx = pi.ipi_new_pidx; txq->ift_npending += pi.ipi_ndescs; } else if (__predict_false(err == EFBIG && remap < 2)) { *m_headp = m_head = iflib_remove_mbuf(txq); remap = 1; txq->ift_txd_encap_efbig++; goto defrag; } else DBG_COUNTER_INC(encap_txd_encap_fail); return (err); defrag_failed: txq->ift_mbuf_defrag_failed++; txq->ift_map_failed++; m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; return (ENOMEM); } /* forward compatibility for cxgb */ #define FIRST_QSET(ctx) 0 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets) #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets) #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx)) #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments)) #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) #define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max) /* if there are more than TXQ_MIN_OCCUPANCY packets pending we consider deferring * doorbell writes * * ORing with 2 assures that min occupancy is never less than 2 without any conditional logic */ #define TXQ_MIN_OCCUPANCY(size) ((size >> 6)| 0x2) static inline int iflib_txq_min_occupancy(iflib_txq_t txq) { if_ctx_t ctx; ctx = txq->ift_ctx; return (get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen) < TXQ_MIN_OCCUPANCY(txq->ift_size) + MAX_TX_DESC(ctx)); } static void iflib_tx_desc_free(iflib_txq_t txq, int n) { int hasmap; uint32_t qsize, cidx, mask, gen; struct mbuf *m, **ifsd_m; uint8_t *ifsd_flags; bus_dmamap_t *ifsd_map; cidx = txq->ift_cidx; gen = txq->ift_gen; qsize = txq->ift_size; mask = qsize-1; hasmap = txq->ift_sds.ifsd_map != NULL; ifsd_flags = txq->ift_sds.ifsd_flags; ifsd_m = txq->ift_sds.ifsd_m; ifsd_map = txq->ift_sds.ifsd_map; while (n--) { prefetch(ifsd_m[(cidx + 3) & mask]); prefetch(ifsd_m[(cidx + 4) & mask]); if (ifsd_m[cidx] != NULL) { prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]); prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]); if (hasmap && (ifsd_flags[cidx] & TX_SW_DESC_MAPPED)) { /* * does it matter if it's not the TSO tag? If so we'll * have to add the type to flags */ bus_dmamap_unload(txq->ift_desc_tag, ifsd_map[cidx]); ifsd_flags[cidx] &= ~TX_SW_DESC_MAPPED; } if ((m = ifsd_m[cidx]) != NULL) { /* XXX we don't support any drivers that batch packets yet */ MPASS(m->m_nextpkt == NULL); m_free(m); ifsd_m[cidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; #endif DBG_COUNTER_INC(tx_frees); } } if (__predict_false(++cidx == qsize)) { cidx = 0; gen = 0; } } txq->ift_cidx = cidx; txq->ift_gen = gen; } static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh) { int reclaim; if_ctx_t ctx = txq->ift_ctx; KASSERT(thresh >= 0, ("invalid threshold to reclaim")); MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size); /* * Need a rate-limiting check so that this isn't called every time */ iflib_tx_credits_update(ctx, txq); reclaim = DESC_RECLAIMABLE(txq); if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) { #ifdef INVARIANTS if (iflib_verbose_debug) { printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__, txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments, reclaim, thresh); } #endif return (0); } iflib_tx_desc_free(txq, reclaim); txq->ift_cleaned += reclaim; txq->ift_in_use -= reclaim; if (txq->ift_active == FALSE) txq->ift_active = TRUE; return (reclaim); } static struct mbuf ** _ring_peek_one(struct ifmp_ring *r, int cidx, int offset) { return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (r->size-1)])); } static void iflib_txq_check_drain(iflib_txq_t txq, int budget) { ifmp_ring_check_drainage(txq->ift_br[0], budget); } static uint32_t iflib_txq_can_drain(struct ifmp_ring *r) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; return ((TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) || ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, false)); } static uint32_t iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; if_t ifp = ctx->ifc_ifp; struct mbuf **mp, *m; int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail, err, in_use_prev, desc_used; if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(txq_drain_notready); return (0); } avail = IDXDIFF(pidx, cidx, r->size); if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) { DBG_COUNTER_INC(txq_drain_flushing); for (i = 0; i < avail; i++) { m_free(r->items[(cidx + i) & (r->size-1)]); r->items[(cidx + i) & (r->size-1)] = NULL; } return (avail); } iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) { txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); DBG_COUNTER_INC(txq_drain_oactive); return (0); } consumed = mcast_sent = bytes_sent = pkt_sent = 0; count = MIN(avail, TX_BATCH_SIZE); #ifdef INVARIANTS if (iflib_verbose_debug) printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__, avail, ctx->ifc_flags, TXQ_AVAIL(txq)); #endif for (desc_used = i = 0; i < count && TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2; i++) { mp = _ring_peek_one(r, cidx, i); MPASS(mp != NULL && *mp != NULL); in_use_prev = txq->ift_in_use; if ((err = iflib_encap(txq, mp)) == ENOBUFS) { DBG_COUNTER_INC(txq_drain_encapfail); /* no room - bail out */ break; } consumed++; if (err) { DBG_COUNTER_INC(txq_drain_encapfail); /* we can't send this packet - skip it */ continue; } pkt_sent++; m = *mp; DBG_COUNTER_INC(tx_sent); bytes_sent += m->m_pkthdr.len; if (m->m_flags & M_MCAST) mcast_sent++; txq->ift_db_pending += (txq->ift_in_use - in_use_prev); desc_used += (txq->ift_in_use - in_use_prev); iflib_txd_db_check(ctx, txq, FALSE); ETHER_BPF_MTAP(ifp, m); if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) break; if (desc_used >= TXQ_MAX_DB_CONSUMED(txq->ift_size)) break; } if ((iflib_min_tx_latency || iflib_txq_min_occupancy(txq)) && txq->ift_db_pending) iflib_txd_db_check(ctx, txq, TRUE); else if ((txq->ift_db_pending || TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2) && (callout_pending(&txq->ift_db_check) == 0)) { txq->ift_db_pending_queued = txq->ift_db_pending; callout_reset_on(&txq->ift_db_check, 1, iflib_txd_deferred_db_check, txq, txq->ift_db_check.c_cpu); } if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); if (mcast_sent) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent); #ifdef INVARIANTS if (iflib_verbose_debug) printf("consumed=%d\n", consumed); #endif return (consumed); } static uint32_t iflib_txq_drain_always(struct ifmp_ring *r) { return (1); } static uint32_t iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) { int i, avail; struct mbuf **mp; iflib_txq_t txq; txq = r->cookie; txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); avail = IDXDIFF(pidx, cidx, r->size); for (i = 0; i < avail; i++) { mp = _ring_peek_one(r, cidx, i); m_freem(*mp); } MPASS(ifmp_ring_is_stalled(r) == 0); return (avail); } static void iflib_ifmp_purge(iflib_txq_t txq) { struct ifmp_ring *r; r = txq->ift_br[0]; r->drain = iflib_txq_drain_free; r->can_drain = iflib_txq_drain_always; ifmp_ring_check_drainage(r, r->size); r->drain = iflib_txq_drain; r->can_drain = iflib_txq_can_drain; } static void _task_fn_tx(void *context) { iflib_txq_t txq = context; if_ctx_t ctx = txq->ift_ctx; #ifdef IFLIB_DIAGNOSTICS txq->ift_cpu_exec_count[curcpu]++; #endif if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE); } static void _task_fn_rx(void *context) { iflib_rxq_t rxq = context; if_ctx_t ctx = rxq->ifr_ctx; bool more; int rc; #ifdef IFLIB_DIAGNOSTICS rxq->ifr_cpu_exec_count[curcpu]++; #endif DBG_COUNTER_INC(task_fn_rxs); if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; if ((more = iflib_rxeof(rxq, 16 /* XXX */)) == false) { if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else { DBG_COUNTER_INC(rx_intr_enables); rc = IFDI_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver")); } } if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; if (more) GROUPTASK_ENQUEUE(&rxq->ifr_task); } static void _task_fn_admin(void *context) { if_ctx_t ctx = context; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; iflib_txq_t txq; int i; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; CTX_LOCK(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); CALLOUT_UNLOCK(txq); } IFDI_UPDATE_ADMIN_STATUS(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); IFDI_LINK_INTR_ENABLE(ctx); CTX_UNLOCK(ctx); if (LINK_ACTIVE(ctx) == 0) return; for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); } static void _task_fn_iov(void *context) { if_ctx_t ctx = context; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; CTX_LOCK(ctx); IFDI_VFLR_HANDLE(ctx); CTX_UNLOCK(ctx); } static int iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS) { int err; if_int_delay_info_t info; if_ctx_t ctx; info = (if_int_delay_info_t)arg1; ctx = info->iidi_ctx; info->iidi_req = req; info->iidi_oidp = oidp; CTX_LOCK(ctx); err = IFDI_SYSCTL_INT_DELAY(ctx, info); CTX_UNLOCK(ctx); return (err); } /********************************************************************* * * IFNET FUNCTIONS * **********************************************************************/ static void iflib_if_init_locked(if_ctx_t ctx) { iflib_stop(ctx); iflib_init_locked(ctx); } static void iflib_if_init(void *arg) { if_ctx_t ctx = arg; CTX_LOCK(ctx); iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } static int iflib_if_transmit(if_t ifp, struct mbuf *m) { if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq; int err, qidx; if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(tx_frees); m_freem(m); return (ENOBUFS); } MPASS(m->m_nextpkt == NULL); qidx = 0; if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m)) qidx = QIDX(ctx, m); /* * XXX calculate buf_ring based on flowid (divvy up bits?) */ txq = &ctx->ifc_txqs[qidx]; #ifdef DRIVER_BACKPRESSURE if (txq->ift_closed) { while (m != NULL) { next = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = next; } return (ENOBUFS); } #endif #ifdef notyet qidx = count = 0; mp = marr; next = m; do { count++; next = next->m_nextpkt; } while (next != NULL); if (count > nitems(marr)) if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) { /* XXX check nextpkt */ m_freem(m); /* XXX simplify for now */ DBG_COUNTER_INC(tx_frees); return (ENOBUFS); } for (next = m, i = 0; next != NULL; i++) { mp[i] = next; next = next->m_nextpkt; mp[i]->m_nextpkt = NULL; } #endif DBG_COUNTER_INC(tx_seen); err = ifmp_ring_enqueue(txq->ift_br[0], (void **)&m, 1, TX_BATCH_SIZE); if (err) { GROUPTASK_ENQUEUE(&txq->ift_task); /* support forthcoming later */ #ifdef DRIVER_BACKPRESSURE txq->ift_closed = TRUE; #endif ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE); m_freem(m); } else if (TXQ_AVAIL(txq) < (txq->ift_size >> 1)) { GROUPTASK_ENQUEUE(&txq->ift_task); } return (err); } static void iflib_if_qflush(if_t ifp) { if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq = ctx->ifc_txqs; int i; CTX_LOCK(ctx); ctx->ifc_flags |= IFC_QFLUSH; CTX_UNLOCK(ctx); for (i = 0; i < NTXQSETS(ctx); i++, txq++) while (!(ifmp_ring_is_idle(txq->ift_br[0]) || ifmp_ring_is_stalled(txq->ift_br[0]))) iflib_txq_check_drain(txq, 0); CTX_LOCK(ctx); ctx->ifc_flags &= ~IFC_QFLUSH; CTX_UNLOCK(ctx); if_qflush(ifp); } #define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | \ IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO) static int iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) { if_ctx_t ctx = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = FALSE; int err = 0, reinit = 0, bits; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { if_setflagbits(ifp, IFF_UP,0); if (!(if_getdrvflags(ifp)& IFF_DRV_RUNNING)) reinit = 1; #ifdef INET if (!(if_getflags(ifp) & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else err = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: CTX_LOCK(ctx); if (ifr->ifr_mtu == if_getmtu(ifp)) { CTX_UNLOCK(ctx); break; } bits = if_getdrvflags(ifp); /* stop the driver and free any clusters before proceeding */ iflib_stop(ctx); if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) { if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size) ctx->ifc_flags |= IFC_MULTISEG; else ctx->ifc_flags &= ~IFC_MULTISEG; err = if_setmtu(ifp, ifr->ifr_mtu); } iflib_init_locked(ctx); if_setdrvflags(ifp, bits); CTX_UNLOCK(ctx); break; case SIOCSIFFLAGS: CTX_LOCK(ctx); if (if_getflags(ifp) & IFF_UP) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { if ((if_getflags(ifp) ^ ctx->ifc_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { err = IFDI_PROMISC_SET(ctx, if_getflags(ifp)); } } else reinit = 1; } else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { iflib_stop(ctx); } ctx->ifc_if_flags = if_getflags(ifp); CTX_UNLOCK(ctx); break; break; case SIOCADDMULTI: case SIOCDELMULTI: if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { CTX_LOCK(ctx); IFDI_INTR_DISABLE(ctx); IFDI_MULTI_SET(ctx); IFDI_INTR_ENABLE(ctx); CTX_UNLOCK(ctx); } break; case SIOCSIFMEDIA: CTX_LOCK(ctx); IFDI_MEDIA_SET(ctx); CTX_UNLOCK(ctx); /* falls thru */ case SIOCGIFMEDIA: err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command); break; case SIOCGI2C: { struct ifi2creq i2c; err = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (err != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { err = EINVAL; break; } if (i2c.len > sizeof(i2c.data)) { err = EINVAL; break; } if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0) err = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } case SIOCSIFCAP: { int mask, setmask; mask = ifr->ifr_reqcap ^ if_getcapenable(ifp); setmask = 0; #ifdef TCP_OFFLOAD setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6); #endif setmask |= (mask & IFCAP_FLAGS); if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); if ((mask & IFCAP_WOL) && (if_getcapabilities(ifp) & IFCAP_WOL) != 0) setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC)); if_vlancap(ifp); /* * want to ensure that traffic has stopped before we change any of the flags */ if (setmask) { CTX_LOCK(ctx); bits = if_getdrvflags(ifp); if (bits & IFF_DRV_RUNNING) iflib_stop(ctx); if_togglecapenable(ifp, setmask); if (bits & IFF_DRV_RUNNING) iflib_init_locked(ctx); if_setdrvflags(ifp, bits); CTX_UNLOCK(ctx); } break; } case SIOCGPRIVATE_0: case SIOCSDRVSPEC: case SIOCGDRVSPEC: CTX_LOCK(ctx); err = IFDI_PRIV_IOCTL(ctx, command, data); CTX_UNLOCK(ctx); break; default: err = ether_ioctl(ifp, command, data); break; } if (reinit) iflib_if_init(ctx); return (err); } static uint64_t iflib_if_get_counter(if_t ifp, ift_counter cnt) { if_ctx_t ctx = if_getsoftc(ifp); return (IFDI_GET_COUNTER(ctx, cnt)); } /********************************************************************* * * OTHER FUNCTIONS EXPORTED TO THE STACK * **********************************************************************/ static void iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; CTX_LOCK(ctx); IFDI_VLAN_REGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) iflib_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag) { if_ctx_t ctx = if_getsoftc(ifp); if ((void *)ctx != arg) return; if ((vtag == 0) || (vtag > 4095)) return; CTX_LOCK(ctx); IFDI_VLAN_UNREGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) iflib_init_locked(ctx); CTX_UNLOCK(ctx); } static void iflib_led_func(void *arg, int onoff) { if_ctx_t ctx = arg; CTX_LOCK(ctx); IFDI_LED_FUNC(ctx, onoff); CTX_UNLOCK(ctx); } /********************************************************************* * * BUS FUNCTION DEFINITIONS * **********************************************************************/ int iflib_device_probe(device_t dev) { pci_vendor_info_t *ent; uint16_t pci_vendor_id, pci_device_id; uint16_t pci_subvendor_id, pci_subdevice_id; uint16_t pci_rev_id; if_shared_ctx_t sctx; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_vendor_id = pci_get_vendor(dev); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); pci_rev_id = pci_get_revid(dev); if (sctx->isc_parse_devinfo != NULL) sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id); ent = sctx->isc_vendor_info; while (ent->pvi_vendor_id != 0) { if (pci_vendor_id != ent->pvi_vendor_id) { ent++; continue; } if ((pci_device_id == ent->pvi_device_id) && ((pci_subvendor_id == ent->pvi_subvendor_id) || (ent->pvi_subvendor_id == 0)) && ((pci_subdevice_id == ent->pvi_subdevice_id) || (ent->pvi_subdevice_id == 0)) && ((pci_rev_id == ent->pvi_rev_id) || (ent->pvi_rev_id == 0))) { device_set_desc_copy(dev, ent->pvi_name); /* this needs to be changed to zero if the bus probing code * ever stops re-probing on best match because the sctx * may have its values over written by register calls * in subsequent probes */ return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } int iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp) { int err, rid, msix, msix_bar; if_ctx_t ctx; if_t ifp; if_softc_ctx_t scctx; int i; uint16_t main_txq; uint16_t main_rxq; ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO); if (sc == NULL) { sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO); device_set_softc(dev, ctx); ctx->ifc_flags |= IFC_SC_ALLOCATED; } ctx->ifc_sctx = sctx; ctx->ifc_dev = dev; ctx->ifc_softc = sc; if ((err = iflib_register(ctx)) != 0) { device_printf(dev, "iflib_register failed %d\n", err); return (err); } iflib_add_device_sysctl_pre(ctx); scctx = &ctx->ifc_softc_ctx; ifp = ctx->ifc_ifp; /* * XXX sanity check that ntxd & nrxd are a power of 2 */ if (ctx->ifc_sysctl_ntxqs != 0) scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs; if (ctx->ifc_sysctl_nrxqs != 0) scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs; for (i = 0; i < sctx->isc_ntxqs; i++) { if (ctx->ifc_sysctl_ntxds[i] != 0) scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i]; else scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (ctx->ifc_sysctl_nrxds[i] != 0) scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i]; else scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i]; } for (i = 0; i < sctx->isc_nrxqs; i++) { if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) { device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i]; } if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) { device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n", i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]); scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i]; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) { device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i]; } if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) { device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n", i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]); scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i]; } } if ((err = IFDI_ATTACH_PRE(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err); return (err); } _iflib_pre_assert(scctx); ctx->ifc_txrx = *scctx->isc_txrx; #ifdef INVARIANTS MPASS(scctx->isc_capenable); if (scctx->isc_capenable & IFCAP_TXCSUM) MPASS(scctx->isc_tx_csum_flags); #endif if_setcapabilities(ifp, scctx->isc_capenable); if_setcapenable(ifp, scctx->isc_capenable); if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets)) scctx->isc_ntxqsets = scctx->isc_ntxqsets_max; if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets)) scctx->isc_nrxqsets = scctx->isc_nrxqsets_max; #ifdef ACPI_DMAR if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL) ctx->ifc_flags |= IFC_DMAR; #endif msix_bar = scctx->isc_msix_bar; if(sctx->isc_flags & IFLIB_HAS_TXCQ) main_txq = 1; else main_txq = 0; if(sctx->isc_flags & IFLIB_HAS_RXCQ) main_rxq = 1; else main_rxq = 0; /* XXX change for per-queue sizes */ device_printf(dev, "using %d tx descriptors and %d rx descriptors\n", scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]); for (i = 0; i < sctx->isc_nrxqs; i++) { if (!powerof2(scctx->isc_nrxd[i])) { /* round down instead? */ device_printf(dev, "# rx descriptors must be a power of 2\n"); err = EINVAL; goto fail; } } for (i = 0; i < sctx->isc_ntxqs; i++) { if (!powerof2(scctx->isc_ntxd[i])) { device_printf(dev, "# tx descriptors must be a power of 2"); err = EINVAL; goto fail; } } if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION) scctx->isc_tx_tso_segments_max = max(1, scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION); /* * Protect the stack against modern hardware */ if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX) scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX; /* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */ ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max; ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max; ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max; if (scctx->isc_rss_table_size == 0) scctx->isc_rss_table_size = 64; scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1; GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx); /* XXX format name */ taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin"); /* ** Now setup MSI or MSI/X, should ** return us the number of supported ** vectors. (Will be 1 for MSI) */ if (sctx->isc_flags & IFLIB_SKIP_MSIX) { msix = scctx->isc_vectors; } else if (scctx->isc_msix_bar != 0) msix = iflib_msix_init(ctx); else { scctx->isc_vectors = 1; scctx->isc_ntxqsets = 1; scctx->isc_nrxqsets = 1; scctx->isc_intr = IFLIB_INTR_LEGACY; msix = 0; } /* Get memory for the station queues */ if ((err = iflib_queues_alloc(ctx))) { device_printf(dev, "Unable to allocate queue memory\n"); goto fail; } if ((err = iflib_qset_structures_setup(ctx))) { device_printf(dev, "qset structure setup failed %d\n", err); goto fail_queues; } IFDI_INTR_DISABLE(ctx); if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) { device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err); goto fail_intr_free; } if (msix <= 1) { rid = 0; if (scctx->isc_intr == IFLIB_INTR_MSI) { MPASS(msix == 1); rid = 1; } if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) { device_printf(dev, "iflib_legacy_setup failed %d\n", err); goto fail_intr_free; } } ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac); if ((err = IFDI_ATTACH_POST(ctx)) != 0) { device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err); goto fail_detach; } if ((err = iflib_netmap_attach(ctx))) { device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err); goto fail_detach; } *ctxp = ctx; if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter); iflib_add_device_sysctl_post(ctx); + ctx->ifc_flags |= IFC_INIT_DONE; return (0); fail_detach: ether_ifdetach(ctx->ifc_ifp); fail_intr_free: if (scctx->isc_intr == IFLIB_INTR_MSIX || scctx->isc_intr == IFLIB_INTR_MSI) pci_release_msi(ctx->ifc_dev); fail_queues: /* XXX free queues */ fail: IFDI_DETACH(ctx); return (err); } int iflib_device_attach(device_t dev) { if_ctx_t ctx; if_shared_ctx_t sctx; if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC) return (ENOTSUP); pci_enable_busmaster(dev); return (iflib_device_register(dev, NULL, sctx, &ctx)); } int iflib_device_deregister(if_ctx_t ctx) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq; iflib_rxq_t rxq; device_t dev = ctx->ifc_dev; int i; struct taskqgroup *tqg; /* Make sure VLANS are not using driver */ if (if_vlantrunkinuse(ifp)) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } CTX_LOCK(ctx); ctx->ifc_in_detach = 1; iflib_stop(ctx); CTX_UNLOCK(ctx); /* Unregister VLAN events */ if (ctx->ifc_vlan_attach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event); if (ctx->ifc_vlan_detach_event != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event); iflib_netmap_detach(ifp); ether_ifdetach(ifp); /* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/ CTX_LOCK_DESTROY(ctx); if (ctx->ifc_led_dev != NULL) led_destroy(ctx->ifc_led_dev); /* XXX drain any dependent tasks */ tqg = qgroup_if_io_tqg; for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) { callout_drain(&txq->ift_timer); callout_drain(&txq->ift_db_check); if (txq->ift_task.gt_uniq != NULL) taskqgroup_detach(tqg, &txq->ift_task); } for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) { if (rxq->ifr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &rxq->ifr_task); } tqg = qgroup_if_config_tqg; if (ctx->ifc_admin_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_admin_task); if (ctx->ifc_vflr_task.gt_uniq != NULL) taskqgroup_detach(tqg, &ctx->ifc_vflr_task); IFDI_DETACH(ctx); device_set_softc(ctx->ifc_dev, NULL); if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) { pci_release_msi(dev); } if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) { iflib_irq_free(ctx, &ctx->ifc_legacy_irq); } if (ctx->ifc_msix_mem != NULL) { bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY, ctx->ifc_softc_ctx.isc_msix_bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; } bus_generic_detach(dev); if_free(ifp); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); if (ctx->ifc_flags & IFC_SC_ALLOCATED) free(ctx->ifc_softc, M_IFLIB); free(ctx, M_IFLIB); return (0); } int iflib_device_detach(device_t dev) { if_ctx_t ctx = device_get_softc(dev); return (iflib_device_deregister(ctx)); } int iflib_device_suspend(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SUSPEND(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_shutdown(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_SHUTDOWN(ctx); CTX_UNLOCK(ctx); return bus_generic_suspend(dev); } int iflib_device_resume(device_t dev) { if_ctx_t ctx = device_get_softc(dev); iflib_txq_t txq = ctx->ifc_txqs; CTX_LOCK(ctx); IFDI_RESUME(ctx); iflib_init_locked(ctx); CTX_UNLOCK(ctx); for (int i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET); return (bus_generic_resume(dev)); } int iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_INIT(ctx, num_vfs, params); CTX_UNLOCK(ctx); return (error); } void iflib_device_iov_uninit(device_t dev) { if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); IFDI_IOV_UNINIT(ctx); CTX_UNLOCK(ctx); } int iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params) { int error; if_ctx_t ctx = device_get_softc(dev); CTX_LOCK(ctx); error = IFDI_IOV_VF_ADD(ctx, vfnum, params); CTX_UNLOCK(ctx); return (error); } /********************************************************************* * * MODULE FUNCTION DEFINITIONS * **********************************************************************/ /* * - Start a fast taskqueue thread for each core * - Start a taskqueue for control operations */ static int iflib_module_init(void) { return (0); } static int iflib_module_event_handler(module_t mod, int what, void *arg) { int err; switch (what) { case MOD_LOAD: if ((err = iflib_module_init()) != 0) return (err); break; case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } return (0); } /********************************************************************* * * PUBLIC FUNCTION DEFINITIONS * ordered as in iflib.h * **********************************************************************/ static void _iflib_assert(if_shared_ctx_t sctx) { MPASS(sctx->isc_tx_maxsize); MPASS(sctx->isc_tx_maxsegsize); MPASS(sctx->isc_rx_maxsize); MPASS(sctx->isc_rx_nsegments); MPASS(sctx->isc_rx_maxsegsize); MPASS(sctx->isc_nrxd_min[0]); MPASS(sctx->isc_nrxd_max[0]); MPASS(sctx->isc_nrxd_default[0]); MPASS(sctx->isc_ntxd_min[0]); MPASS(sctx->isc_ntxd_max[0]); MPASS(sctx->isc_ntxd_default[0]); } static void _iflib_pre_assert(if_softc_ctx_t scctx) { MPASS(scctx->isc_txrx->ift_txd_encap); MPASS(scctx->isc_txrx->ift_txd_flush); MPASS(scctx->isc_txrx->ift_txd_credits_update); MPASS(scctx->isc_txrx->ift_rxd_available); MPASS(scctx->isc_txrx->ift_rxd_pkt_get); MPASS(scctx->isc_txrx->ift_rxd_refill); MPASS(scctx->isc_txrx->ift_rxd_flush); } static int iflib_register(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; driver_t *driver = sctx->isc_driver; device_t dev = ctx->ifc_dev; if_t ifp; _iflib_assert(sctx); CTX_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev)); ifp = ctx->ifc_ifp = if_gethandle(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (ENOMEM); } /* * Initialize our context's device specific methods */ kobj_init((kobj_t) ctx, (kobj_class_t) driver); kobj_class_compile((kobj_class_t) driver); driver->refs++; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, ctx); if_setdev(ifp, dev); if_setinitfn(ifp, iflib_if_init); if_setioctlfn(ifp, iflib_if_ioctl); if_settransmitfn(ifp, iflib_if_transmit); if_setqflushfn(ifp, iflib_if_qflush); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); ctx->ifc_vlan_attach_event = EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx, EVENTHANDLER_PRI_FIRST); ctx->ifc_vlan_detach_event = EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx, EVENTHANDLER_PRI_FIRST); ifmedia_init(&ctx->ifc_media, IFM_IMASK, iflib_media_change, iflib_media_status); return (0); } static int iflib_queues_alloc(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; int nrxqsets = scctx->isc_nrxqsets; int ntxqsets = scctx->isc_ntxqsets; iflib_txq_t txq; iflib_rxq_t rxq; iflib_fl_t fl = NULL; int i, j, cpu, err, txconf, rxconf; iflib_dma_info_t ifdip; uint32_t *rxqsizes = scctx->isc_rxqsizes; uint32_t *txqsizes = scctx->isc_txqsizes; uint8_t nrxqs = sctx->isc_nrxqs; uint8_t ntxqs = sctx->isc_ntxqs; int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1; caddr_t *vaddrs; uint64_t *paddrs; struct ifmp_ring **brscp; int nbuf_rings = 1; /* XXX determine dynamically */ KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1")); KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1")); brscp = NULL; txq = NULL; rxq = NULL; /* Allocate the TX ring struct memory */ if (!(txq = (iflib_txq_t) malloc(sizeof(struct iflib_txq) * ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); err = ENOMEM; goto fail; } /* Now allocate the RX */ if (!(rxq = (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); err = ENOMEM; goto rx_fail; } if (!(brscp = malloc(sizeof(void *) * nbuf_rings * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to buf_ring_sc * memory\n"); err = ENOMEM; goto rx_fail; } ctx->ifc_txqs = txq; ctx->ifc_rxqs = rxq; /* * XXX handle allocation failure */ for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) { /* Set up some basics */ if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) { device_printf(dev, "failed to allocate iflib_dma_info\n"); err = ENOMEM; goto err_tx_desc; } txq->ift_ifdi = ifdip; for (j = 0; j < ntxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate Descriptor memory\n"); err = ENOMEM; goto err_tx_desc; } bzero((void *)ifdip->idi_vaddr, txqsizes[j]); } txq->ift_ctx = ctx; txq->ift_id = i; if (sctx->isc_flags & IFLIB_HAS_TXCQ) { txq->ift_br_offset = 1; } else { txq->ift_br_offset = 0; } /* XXX fix this */ txq->ift_timer.c_cpu = cpu; txq->ift_db_check.c_cpu = cpu; txq->ift_nbr = nbuf_rings; if (iflib_txsd_alloc(txq)) { device_printf(dev, "Critical Failure setting up TX buffers\n"); err = ENOMEM; goto err_tx_desc; } /* Initialize the TX lock */ snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout", device_get_nameunit(dev), txq->ift_id); mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF); callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0); callout_init_mtx(&txq->ift_db_check, &txq->ift_mtx, 0); snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db", device_get_nameunit(dev), txq->ift_id); TXDB_LOCK_INIT(txq); txq->ift_br = brscp + i*nbuf_rings; for (j = 0; j < nbuf_rings; j++) { err = ifmp_ring_alloc(&txq->ift_br[j], 2048, txq, iflib_txq_drain, iflib_txq_can_drain, M_IFLIB, M_WAITOK); if (err) { /* XXX free any allocated rings */ device_printf(dev, "Unable to allocate buf_ring\n"); goto err_tx_desc; } } } for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) { /* Set up some basics */ if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) { device_printf(dev, "failed to allocate iflib_dma_info\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_ifdi = ifdip; for (j = 0; j < nrxqs; j++, ifdip++) { if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate Descriptor memory\n"); err = ENOMEM; goto err_tx_desc; } bzero((void *)ifdip->idi_vaddr, rxqsizes[j]); } rxq->ifr_ctx = ctx; rxq->ifr_id = i; if (sctx->isc_flags & IFLIB_HAS_RXCQ) { rxq->ifr_fl_offset = 1; } else { rxq->ifr_fl_offset = 0; } rxq->ifr_nfl = nfree_lists; if (!(fl = (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate free list memory\n"); err = ENOMEM; goto err_tx_desc; } rxq->ifr_fl = fl; for (j = 0; j < nfree_lists; j++) { rxq->ifr_fl[j].ifl_rxq = rxq; rxq->ifr_fl[j].ifl_id = j; rxq->ifr_fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset]; } /* Allocate receive buffers for the ring*/ if (iflib_rxsd_alloc(rxq)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); err = ENOMEM; goto err_rx_desc; } } /* TXQs */ vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK); for (i = 0; i < ntxqsets; i++) { iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi; for (j = 0; j < ntxqs; j++, di++) { vaddrs[i*ntxqs + j] = di->idi_vaddr; paddrs[i*ntxqs + j] = di->idi_paddr; } } if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) { device_printf(ctx->ifc_dev, "device queue allocation failed\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); /* RXQs */ vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK); for (i = 0; i < nrxqsets; i++) { iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi; for (j = 0; j < nrxqs; j++, di++) { vaddrs[i*nrxqs + j] = di->idi_vaddr; paddrs[i*nrxqs + j] = di->idi_paddr; } } if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) { device_printf(ctx->ifc_dev, "device queue allocation failed\n"); iflib_tx_structures_free(ctx); free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); goto err_rx_desc; } free(vaddrs, M_IFLIB); free(paddrs, M_IFLIB); return (0); /* XXX handle allocation failure changes */ err_rx_desc: err_tx_desc: if (ctx->ifc_rxqs != NULL) free(ctx->ifc_rxqs, M_IFLIB); ctx->ifc_rxqs = NULL; if (ctx->ifc_txqs != NULL) free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; rx_fail: if (brscp != NULL) free(brscp, M_IFLIB); if (rxq != NULL) free(rxq, M_IFLIB); if (txq != NULL) free(txq, M_IFLIB); fail: return (err); } static int iflib_tx_structures_setup(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; int i; for (i = 0; i < NTXQSETS(ctx); i++, txq++) iflib_txq_setup(txq); return (0); } static void iflib_tx_structures_free(if_ctx_t ctx) { iflib_txq_t txq = ctx->ifc_txqs; int i, j; for (i = 0; i < NTXQSETS(ctx); i++, txq++) { iflib_txq_destroy(txq); for (j = 0; j < ctx->ifc_nhwtxqs; j++) iflib_dma_free(&txq->ift_ifdi[j]); } free(ctx->ifc_txqs, M_IFLIB); ctx->ifc_txqs = NULL; IFDI_QUEUES_FREE(ctx); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int iflib_rx_structures_setup(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; int q; #if defined(INET6) || defined(INET) int i, err; #endif for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) { #if defined(INET6) || defined(INET) tcp_lro_free(&rxq->ifr_lc); if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp, TCP_LRO_ENTRIES, min(1024, ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) { device_printf(ctx->ifc_dev, "LRO Initialization failed!\n"); goto fail; } rxq->ifr_lro_enabled = TRUE; #endif IFDI_RXQ_SETUP(ctx, rxq->ifr_id); } return (0); #if defined(INET6) || defined(INET) fail: /* * Free RX software descriptors allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'q' failed, so its the terminus. */ rxq = ctx->ifc_rxqs; for (i = 0; i < q; ++i, rxq++) { iflib_rx_sds_free(rxq); rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0; } return (err); #endif } /********************************************************************* * * Free all receive rings. * **********************************************************************/ static void iflib_rx_structures_free(if_ctx_t ctx) { iflib_rxq_t rxq = ctx->ifc_rxqs; for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) { iflib_rx_sds_free(rxq); } } static int iflib_qset_structures_setup(if_ctx_t ctx) { int err; if ((err = iflib_tx_structures_setup(ctx)) != 0) return (err); if ((err = iflib_rx_structures_setup(ctx)) != 0) { device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err); iflib_tx_structures_free(ctx); iflib_rx_structures_free(ctx); } return (err); } int iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, char *name) { return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name)); } static int find_nth(if_ctx_t ctx, cpuset_t *cpus, int qid) { int i, cpuid, eqid, count; CPU_COPY(&ctx->ifc_cpus, cpus); count = CPU_COUNT(&ctx->ifc_cpus); eqid = qid % count; /* clear up to the qid'th bit */ for (i = 0; i < eqid; i++) { cpuid = CPU_FFS(cpus); MPASS(cpuid != 0); CPU_CLR(cpuid-1, cpus); } cpuid = CPU_FFS(cpus); MPASS(cpuid != 0); return (cpuid-1); } int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid, iflib_intr_type_t type, driver_filter_t *filter, void *filter_arg, int qid, char *name) { struct grouptask *gtask; struct taskqgroup *tqg; iflib_filter_info_t info; cpuset_t cpus; gtask_fn_t *fn; int tqrid, err, cpuid; void *q; info = &ctx->ifc_filter_info; tqrid = rid; switch (type) { /* XXX merge tx/rx for netmap? */ case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; info = &ctx->ifc_txqs[qid].ift_filter_info; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; info = &ctx->ifc_rxqs[qid].ifr_filter_info; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; GROUPTASK_INIT(gtask, 0, fn, q); break; case IFLIB_INTR_ADMIN: q = ctx; tqrid = -1; info = &ctx->ifc_filter_info; gtask = &ctx->ifc_admin_task; tqg = qgroup_if_config_tqg; fn = _task_fn_admin; break; default: panic("unknown net intr type"); } info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; + info->ifi_ctx = ctx; err = _iflib_irq_alloc(ctx, irq, rid, iflib_fast_intr, NULL, info, name); if (err != 0) { device_printf(ctx->ifc_dev, "_iflib_irq_alloc failed %d\n", err); return (err); } if (type == IFLIB_INTR_ADMIN) return (0); if (tqrid != -1) { cpuid = find_nth(ctx, &cpus, qid); taskqgroup_attach_cpu(tqg, gtask, q, cpuid, irq->ii_rid, name); } else { taskqgroup_attach(tqg, gtask, q, tqrid, name); } return (0); } void iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type, void *arg, int qid, char *name) { struct grouptask *gtask; struct taskqgroup *tqg; gtask_fn_t *fn; void *q; switch (type) { case IFLIB_INTR_TX: q = &ctx->ifc_txqs[qid]; gtask = &ctx->ifc_txqs[qid].ift_task; tqg = qgroup_if_io_tqg; fn = _task_fn_tx; break; case IFLIB_INTR_RX: q = &ctx->ifc_rxqs[qid]; gtask = &ctx->ifc_rxqs[qid].ifr_task; tqg = qgroup_if_io_tqg; fn = _task_fn_rx; break; case IFLIB_INTR_IOV: q = ctx; gtask = &ctx->ifc_vflr_task; tqg = qgroup_if_config_tqg; rid = -1; fn = _task_fn_iov; break; default: panic("unknown net intr type"); } GROUPTASK_INIT(gtask, 0, fn, q); taskqgroup_attach(tqg, gtask, q, rid, name); } void iflib_irq_free(if_ctx_t ctx, if_irq_t irq) { if (irq->ii_tag) bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag); if (irq->ii_res) bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, irq->ii_rid, irq->ii_res); } static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, char *name) { iflib_txq_t txq = ctx->ifc_txqs; iflib_rxq_t rxq = ctx->ifc_rxqs; if_irq_t irq = &ctx->ifc_legacy_irq; iflib_filter_info_t info; struct grouptask *gtask; struct taskqgroup *tqg; gtask_fn_t *fn; int tqrid; void *q; int err; /* * group taskqueues aren't properly set up until SMP is started * so we disable interrupts until we can handle them post * SI_SUB_SMP */ IFDI_INTR_DISABLE(ctx); q = &ctx->ifc_rxqs[0]; info = &rxq[0].ifr_filter_info; gtask = &rxq[0].ifr_task; tqg = qgroup_if_io_tqg; tqrid = irq->ii_rid = *rid; fn = _task_fn_rx; ctx->ifc_flags |= IFC_LEGACY; info->ifi_filter = filter; info->ifi_filter_arg = filter_arg; info->ifi_task = gtask; + info->ifi_ctx = ctx; /* We allocate a single interrupt resource */ if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr, NULL, info, name)) != 0) return (err); GROUPTASK_INIT(gtask, 0, fn, q); taskqgroup_attach(tqg, gtask, q, tqrid, name); GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq); taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, tqrid, "tx"); return (0); } void iflib_led_create(if_ctx_t ctx) { ctx->ifc_led_dev = led_create(iflib_led_func, ctx, device_get_nameunit(ctx->ifc_dev)); } void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid) { GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task); } void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid) { GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task); } void iflib_admin_intr_deferred(if_ctx_t ctx) { #ifdef INVARIANTS struct grouptask *gtask; gtask = &ctx->ifc_admin_task; MPASS(gtask->gt_taskqueue != NULL); #endif GROUPTASK_ENQUEUE(&ctx->ifc_admin_task); } void iflib_iov_intr_deferred(if_ctx_t ctx) { GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task); } void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name) { taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name); } void iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask, gtask_fn_t *fn, char *name) { GROUPTASK_INIT(gtask, 0, fn, ctx); taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name); } void iflib_config_gtask_deinit(struct grouptask *gtask) { taskqgroup_detach(qgroup_if_config_tqg, gtask); } void iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate) { if_t ifp = ctx->ifc_ifp; iflib_txq_t txq = ctx->ifc_txqs; if_setbaudrate(ifp, baudrate); /* If link down, disable watchdog */ if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) { for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++) txq->ift_qstatus = IFLIB_QUEUE_IDLE; } ctx->ifc_link_state = link_state; if_link_state_change(ifp, link_state); } static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq) { int credits; #ifdef INVARIANTS int credits_pre = txq->ift_cidx_processed; #endif if (ctx->isc_txd_credits_update == NULL) return (0); if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, true)) == 0) return (0); txq->ift_processed += credits; txq->ift_cidx_processed += credits; MPASS(credits_pre + credits == txq->ift_cidx_processed); if (txq->ift_cidx_processed >= txq->ift_size) txq->ift_cidx_processed -= txq->ift_size; return (credits); } static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget) { return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx, budget)); } void iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name, const char *description, if_int_delay_info_t info, int offset, int value) { info->iidi_ctx = ctx; info->iidi_offset = offset; info->iidi_value = value; SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev), SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, info, 0, iflib_sysctl_int_delay, "I", description); } struct mtx * iflib_ctx_lock_get(if_ctx_t ctx) { return (&ctx->ifc_mtx); } static int iflib_msix_init(if_ctx_t ctx) { device_t dev = ctx->ifc_dev; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs; int iflib_num_tx_queues, iflib_num_rx_queues; int err, admincnt, bar; iflib_num_tx_queues = scctx->isc_ntxqsets; iflib_num_rx_queues = scctx->isc_nrxqsets; device_printf(dev, "msix_init qsets capped at %d\n", iflib_num_tx_queues); bar = ctx->ifc_softc_ctx.isc_msix_bar; admincnt = sctx->isc_admin_intrcnt; /* Override by tuneable */ if (enable_msix == 0) goto msi; /* ** When used in a virtualized environment ** PCI BUSMASTER capability may not be set ** so explicity set it here and rewrite ** the ENABLE in the MSIX control register ** at this point to cause the host to ** successfully initialize us. */ { uint16_t pci_cmd_word; int msix_ctrl, rid; rid = 0; pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2); pci_cmd_word |= PCIM_CMD_BUSMASTEREN; pci_write_config(dev, PCIR_COMMAND, pci_cmd_word, 2); pci_find_cap(dev, PCIY_MSIX, &rid); rid += PCIR_MSIX_CTRL; msix_ctrl = pci_read_config(dev, rid, 2); msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE; pci_write_config(dev, rid, msix_ctrl, 2); } /* * bar == -1 => "trust me I know what I'm doing" * https://www.youtube.com/watch?v=nnwWKkNau4I * Some drivers are for hardware that is so shoddily * documented that no one knows which bars are which * so the developer has to map all bars. This hack * allows shoddy garbage to use msix in this framework. */ if (bar != -1) { ctx->ifc_msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &bar, RF_ACTIVE); if (ctx->ifc_msix_mem == NULL) { /* May not be enabled */ device_printf(dev, "Unable to map MSIX table \n"); goto msi; } } /* First try MSI/X */ if ((msgs = pci_msix_count(dev)) == 0) { /* system has msix disabled */ device_printf(dev, "System has MSIX disabled \n"); bus_release_resource(dev, SYS_RES_MEMORY, bar, ctx->ifc_msix_mem); ctx->ifc_msix_mem = NULL; goto msi; } #if IFLIB_DEBUG /* use only 1 qset in debug mode */ queuemsgs = min(msgs - admincnt, 1); #else queuemsgs = msgs - admincnt; #endif if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) == 0) { #ifdef RSS queues = imin(queuemsgs, rss_getnumbuckets()); #else queues = queuemsgs; #endif queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues); device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n", CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt); } else { device_printf(dev, "Unable to fetch CPU list\n"); /* Figure out a reasonable auto config value */ queues = min(queuemsgs, mp_ncpus); } #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt) rx_queues = iflib_num_rx_queues; else rx_queues = queues; /* * We want this to be all logical CPUs by default */ if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues) tx_queues = iflib_num_tx_queues; else tx_queues = mp_ncpus; if (ctx->ifc_sysctl_qs_eq_override == 0) { #ifdef INVARIANTS if (tx_queues != rx_queues) device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n", min(rx_queues, tx_queues), min(rx_queues, tx_queues)); #endif tx_queues = min(rx_queues, tx_queues); rx_queues = min(rx_queues, tx_queues); } device_printf(dev, "using %d rx queues %d tx queues \n", rx_queues, tx_queues); vectors = rx_queues + admincnt; if ((err = pci_alloc_msix(dev, &vectors)) == 0) { device_printf(dev, "Using MSIX interrupts with %d vectors\n", vectors); scctx->isc_vectors = vectors; scctx->isc_nrxqsets = rx_queues; scctx->isc_ntxqsets = tx_queues; scctx->isc_intr = IFLIB_INTR_MSIX; return (vectors); } else { device_printf(dev, "failed to allocate %d msix vectors, err: %d - using MSI\n", vectors, err); } msi: vectors = pci_msi_count(dev); scctx->isc_nrxqsets = 1; scctx->isc_ntxqsets = 1; scctx->isc_vectors = vectors; if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) { device_printf(dev,"Using an MSI interrupt\n"); scctx->isc_intr = IFLIB_INTR_MSI; } else { device_printf(dev,"Using a Legacy interrupt\n"); scctx->isc_intr = IFLIB_INTR_LEGACY; } return (vectors); } char * ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" }; static int mp_ring_state_handler(SYSCTL_HANDLER_ARGS) { int rc; uint16_t *state = ((uint16_t *)oidp->oid_arg1); struct sbuf *sb; char *ring_state = "UNKNOWN"; /* XXX needed ? */ rc = sysctl_wire_old_buffer(req, 0); MPASS(rc == 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 80, req); MPASS(sb != NULL); if (sb == NULL) return (ENOMEM); if (state[3] <= 3) ring_state = ring_states[state[3]]; sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s", state[0], state[1], state[2], ring_state); rc = sbuf_finish(sb); sbuf_delete(sb); return(rc); } enum iflib_ndesc_handler { IFLIB_NTXD_HANDLER, IFLIB_NRXD_HANDLER, }; static int mp_ndesc_handler(SYSCTL_HANDLER_ARGS) { if_ctx_t ctx = (void *)arg1; enum iflib_ndesc_handler type = arg2; char buf[256] = {0}; uint16_t *ndesc; char *p, *next; int nqs, rc, i; MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER); nqs = 8; switch(type) { case IFLIB_NTXD_HANDLER: ndesc = ctx->ifc_sysctl_ntxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_ntxqs; break; case IFLIB_NRXD_HANDLER: ndesc = ctx->ifc_sysctl_nrxds; if (ctx->ifc_sctx) nqs = ctx->ifc_sctx->isc_nrxqs; break; } if (nqs == 0) nqs = 8; for (i=0; i<8; i++) { if (i >= nqs) break; if (i) strcat(buf, ","); sprintf(strchr(buf, 0), "%d", ndesc[i]); } rc = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (rc || req->newptr == NULL) return rc; for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p; i++, p = strsep(&next, " ,")) { ndesc[i] = strtoul(p, NULL, 10); } return(rc); } #define NAME_BUFLEN 32 static void iflib_add_device_sysctl_pre(if_ctx_t ctx) { device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child, *oid_list; struct sysctl_ctx_list *ctx_list; struct sysctl_oid *node; ctx_list = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib", CTLFLAG_RD, NULL, "IFLIB fields"); oid_list = SYSCTL_CHILDREN(node); SYSCTL_ADD_STRING(ctx_list, oid_list, OID_AUTO, "driver_version", CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, 0, "driver version"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0, "# of txqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0, "# of rxqs to use, 0 => use default #"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable", CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0, "permit #txq != #rxq"); /* XXX change for per-queue sizes */ SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A", "list of # of tx descriptors to use, 0 = use default #"); SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds", CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A", "list of # of rx descriptors to use, 0 = use default #"); } static void iflib_add_device_sysctl_post(if_ctx_t ctx) { if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = iflib_get_dev(ctx); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx_list; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; int i, j; char namebuf[NAME_BUFLEN]; char *qfmt; struct sysctl_oid *queue_node, *fl_node, *node; struct sysctl_oid_list *queue_list, *fl_list; ctx_list = device_get_sysctl_ctx(dev); node = ctx->ifc_sysctl_node; child = SYSCTL_CHILDREN(node); if (scctx->isc_ntxqsets > 100) qfmt = "txq%03d"; else if (scctx->isc_ntxqsets > 10) qfmt = "txq%02d"; else qfmt = "txq%d"; for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued", CTLFLAG_RD, &txq->ift_dequeued, "total mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued", CTLFLAG_RD, &txq->ift_enqueued, "total mbufs enqueued"); #endif SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag", CTLFLAG_RD, &txq->ift_mbuf_defrag, "# of times m_defrag was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups", CTLFLAG_RD, &txq->ift_pullups, "# of times m_pullup was called"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txq->ift_no_desc_avail, "# of times no descriptors were available"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed", CTLFLAG_RD, &txq->ift_map_failed, "# of times dma map failed"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig", CTLFLAG_RD, &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup", CTLFLAG_RD, &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx", CTLFLAG_RD, &txq->ift_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx", CTLFLAG_RD, &txq->ift_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed", CTLFLAG_RD, &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use", CTLFLAG_RD, &txq->ift_in_use, 1, "descriptors in use"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed", CTLFLAG_RD, &txq->ift_processed, "descriptors procesed for clean"); SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned", CTLFLAG_RD, &txq->ift_cleaned, "total cleaned"); SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state", CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br[0]->state), 0, mp_ring_state_handler, "A", "soft ring state"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->ift_br[0]->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->ift_br[0]->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->ift_br[0]->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->ift_br[0]->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->ift_br[0]->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->ift_br[0]->abdications, "# of consumer abdications in the mp_ring for this queue"); } if (scctx->isc_nrxqsets > 100) qfmt = "rxq%03d"; else if (scctx->isc_nrxqsets > 10) qfmt = "rxq%02d"; else qfmt = "rxq%d"; for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) { snprintf(namebuf, NAME_BUFLEN, qfmt, i); queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); if (sctx->isc_flags & IFLIB_HAS_RXCQ) { SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx", CTLFLAG_RD, &rxq->ifr_cq_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx", CTLFLAG_RD, &rxq->ifr_cq_cidx, 1, "Consumer Index"); } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j); fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "freelist Name"); fl_list = SYSCTL_CHILDREN(fl_node); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx", CTLFLAG_RD, &fl->ifl_pidx, 1, "Producer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx", CTLFLAG_RD, &fl->ifl_cidx, 1, "Consumer Index"); SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits", CTLFLAG_RD, &fl->ifl_credits, 1, "credits available"); #if MEMORY_LOGGING SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued", CTLFLAG_RD, &fl->ifl_m_enqueued, "mbufs allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued", CTLFLAG_RD, &fl->ifl_m_dequeued, "mbufs freed"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued", CTLFLAG_RD, &fl->ifl_cl_enqueued, "clusters allocated"); SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued", CTLFLAG_RD, &fl->ifl_cl_dequeued, "clusters freed"); #endif } } } Index: projects/netbsd-tests-upstream-01-2017/sys/sys/ata.h =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/sys/ata.h (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/sys/ata.h (revision 312218) @@ -1,1015 +1,1015 @@ /*- * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_ATA_H_ #define _SYS_ATA_H_ #include /* ATA/ATAPI device parameters */ struct ata_params { /*000*/ u_int16_t config; /* configuration info */ #define ATA_PROTO_MASK 0x8003 #define ATA_PROTO_ATAPI 0x8000 #define ATA_PROTO_ATAPI_12 0x8000 #define ATA_PROTO_ATAPI_16 0x8001 #define ATA_PROTO_CFA 0x848a #define ATA_ATAPI_TYPE_MASK 0x1f00 #define ATA_ATAPI_TYPE_DIRECT 0x0000 /* disk/floppy */ #define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */ #define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */ #define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */ #define ATA_DRQ_MASK 0x0060 #define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */ #define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */ #define ATA_DRQ_FAST 0x0040 /* accel 50 us delay */ #define ATA_RESP_INCOMPLETE 0x0004 /*001*/ u_int16_t cylinders; /* # of cylinders */ /*002*/ u_int16_t specconf; /* specific configuration */ /*003*/ u_int16_t heads; /* # heads */ u_int16_t obsolete4; u_int16_t obsolete5; /*006*/ u_int16_t sectors; /* # sectors/track */ /*007*/ u_int16_t vendor7[3]; /*010*/ u_int8_t serial[20]; /* serial number */ /*020*/ u_int16_t retired20; u_int16_t retired21; u_int16_t obsolete22; /*023*/ u_int8_t revision[8]; /* firmware revision */ /*027*/ u_int8_t model[40]; /* model name */ /*047*/ u_int16_t sectors_intr; /* sectors per interrupt */ /*048*/ u_int16_t usedmovsd; /* double word read/write? */ /*049*/ u_int16_t capabilities1; #define ATA_SUPPORT_DMA 0x0100 #define ATA_SUPPORT_LBA 0x0200 #define ATA_SUPPORT_IORDY 0x0400 #define ATA_SUPPORT_IORDYDIS 0x0800 #define ATA_SUPPORT_OVERLAP 0x4000 /*050*/ u_int16_t capabilities2; /*051*/ u_int16_t retired_piomode; /* PIO modes 0-2 */ #define ATA_RETIRED_PIO_MASK 0x0300 /*052*/ u_int16_t retired_dmamode; /* DMA modes */ #define ATA_RETIRED_DMA_MASK 0x0003 /*053*/ u_int16_t atavalid; /* fields valid */ #define ATA_FLAG_54_58 0x0001 /* words 54-58 valid */ #define ATA_FLAG_64_70 0x0002 /* words 64-70 valid */ #define ATA_FLAG_88 0x0004 /* word 88 valid */ /*054*/ u_int16_t current_cylinders; /*055*/ u_int16_t current_heads; /*056*/ u_int16_t current_sectors; /*057*/ u_int16_t current_size_1; /*058*/ u_int16_t current_size_2; /*059*/ u_int16_t multi; #define ATA_MULTI_VALID 0x0100 /*060*/ u_int16_t lba_size_1; u_int16_t lba_size_2; u_int16_t obsolete62; /*063*/ u_int16_t mwdmamodes; /* multiword DMA modes */ /*064*/ u_int16_t apiomodes; /* advanced PIO modes */ /*065*/ u_int16_t mwdmamin; /* min. M/W DMA time/word ns */ /*066*/ u_int16_t mwdmarec; /* rec. M/W DMA time ns */ /*067*/ u_int16_t pioblind; /* min. PIO cycle w/o flow */ /*068*/ u_int16_t pioiordy; /* min. PIO cycle IORDY flow */ /*069*/ u_int16_t support3; #define ATA_SUPPORT_RZAT 0x0020 #define ATA_SUPPORT_DRAT 0x4000 #define ATA_SUPPORT_ZONE_MASK 0x0003 #define ATA_SUPPORT_ZONE_NR 0x0000 #define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001 #define ATA_SUPPORT_ZONE_DEV_MANAGED 0x0002 u_int16_t reserved70; /*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ /*072*/ u_int16_t rlsservice; /* rel time (us) for service */ u_int16_t reserved73; u_int16_t reserved74; /*075*/ u_int16_t queue; #define ATA_QUEUE_LEN(x) ((x) & 0x001f) /*76*/ u_int16_t satacapabilities; #define ATA_SATA_GEN1 0x0002 #define ATA_SATA_GEN2 0x0004 #define ATA_SATA_GEN3 0x0008 #define ATA_SUPPORT_NCQ 0x0100 #define ATA_SUPPORT_IFPWRMNGTRCV 0x0200 #define ATA_SUPPORT_PHYEVENTCNT 0x0400 #define ATA_SUPPORT_NCQ_UNLOAD 0x0800 #define ATA_SUPPORT_NCQ_PRIO 0x1000 #define ATA_SUPPORT_HAPST 0x2000 #define ATA_SUPPORT_DAPST 0x4000 #define ATA_SUPPORT_READLOGDMAEXT 0x8000 /*77*/ u_int16_t satacapabilities2; #define ATA_SATA_CURR_GEN_MASK 0x0006 #define ATA_SUPPORT_NCQ_STREAM 0x0010 #define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 /*78*/ u_int16_t satasupport; #define ATA_SUPPORT_NONZERO 0x0002 #define ATA_SUPPORT_AUTOACTIVATE 0x0004 #define ATA_SUPPORT_IFPWRMNGT 0x0008 #define ATA_SUPPORT_INORDERDATA 0x0010 #define ATA_SUPPORT_ASYNCNOTIF 0x0020 #define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 /*79*/ u_int16_t sataenabled; #define ATA_ENABLED_DAPST 0x0080 /*080*/ u_int16_t version_major; /*081*/ u_int16_t version_minor; struct { /*082/085*/ u_int16_t command1; #define ATA_SUPPORT_SMART 0x0001 #define ATA_SUPPORT_SECURITY 0x0002 #define ATA_SUPPORT_REMOVABLE 0x0004 #define ATA_SUPPORT_POWERMGT 0x0008 #define ATA_SUPPORT_PACKET 0x0010 #define ATA_SUPPORT_WRITECACHE 0x0020 #define ATA_SUPPORT_LOOKAHEAD 0x0040 #define ATA_SUPPORT_RELEASEIRQ 0x0080 #define ATA_SUPPORT_SERVICEIRQ 0x0100 #define ATA_SUPPORT_RESET 0x0200 #define ATA_SUPPORT_PROTECTED 0x0400 #define ATA_SUPPORT_WRITEBUFFER 0x1000 #define ATA_SUPPORT_READBUFFER 0x2000 #define ATA_SUPPORT_NOP 0x4000 /*083/086*/ u_int16_t command2; #define ATA_SUPPORT_MICROCODE 0x0001 #define ATA_SUPPORT_QUEUED 0x0002 #define ATA_SUPPORT_CFA 0x0004 #define ATA_SUPPORT_APM 0x0008 #define ATA_SUPPORT_NOTIFY 0x0010 #define ATA_SUPPORT_STANDBY 0x0020 #define ATA_SUPPORT_SPINUP 0x0040 #define ATA_SUPPORT_MAXSECURITY 0x0100 #define ATA_SUPPORT_AUTOACOUSTIC 0x0200 #define ATA_SUPPORT_ADDRESS48 0x0400 #define ATA_SUPPORT_OVERLAY 0x0800 #define ATA_SUPPORT_FLUSHCACHE 0x1000 #define ATA_SUPPORT_FLUSHCACHE48 0x2000 /*084/087*/ u_int16_t extension; #define ATA_SUPPORT_SMARTLOG 0x0001 #define ATA_SUPPORT_SMARTTEST 0x0002 #define ATA_SUPPORT_MEDIASN 0x0004 #define ATA_SUPPORT_MEDIAPASS 0x0008 #define ATA_SUPPORT_STREAMING 0x0010 #define ATA_SUPPORT_GENLOG 0x0020 #define ATA_SUPPORT_WRITEDMAFUAEXT 0x0040 #define ATA_SUPPORT_WRITEDMAQFUAEXT 0x0080 #define ATA_SUPPORT_64BITWWN 0x0100 #define ATA_SUPPORT_UNLOAD 0x2000 } __packed support, enabled; /*088*/ u_int16_t udmamodes; /* UltraDMA modes */ /*089*/ u_int16_t erase_time; /* time req'd in 2min units */ /*090*/ u_int16_t enhanced_erase_time; /* time req'd in 2min units */ /*091*/ u_int16_t apm_value; /*092*/ u_int16_t master_passwd_revision; /* password revision code */ /*093*/ u_int16_t hwres; #define ATA_CABLE_ID 0x2000 /*094*/ u_int16_t acoustic; #define ATA_ACOUSTIC_CURRENT(x) ((x) & 0x00ff) #define ATA_ACOUSTIC_VENDOR(x) (((x) & 0xff00) >> 8) /*095*/ u_int16_t stream_min_req_size; /*096*/ u_int16_t stream_transfer_time; /*097*/ u_int16_t stream_access_latency; /*098*/ u_int32_t stream_granularity; /*100*/ u_int16_t lba_size48_1; u_int16_t lba_size48_2; u_int16_t lba_size48_3; u_int16_t lba_size48_4; u_int16_t reserved104; /*105*/ u_int16_t max_dsm_blocks; /*106*/ u_int16_t pss; #define ATA_PSS_LSPPS 0x000F #define ATA_PSS_LSSABOVE512 0x1000 #define ATA_PSS_MULTLS 0x2000 #define ATA_PSS_VALID_MASK 0xC000 #define ATA_PSS_VALID_VALUE 0x4000 /*107*/ u_int16_t isd; /*108*/ u_int16_t wwn[4]; u_int16_t reserved112[5]; /*117*/ u_int16_t lss_1; /*118*/ u_int16_t lss_2; /*119*/ u_int16_t support2; #define ATA_SUPPORT_WRITEREADVERIFY 0x0002 #define ATA_SUPPORT_WRITEUNCORREXT 0x0004 #define ATA_SUPPORT_RWLOGDMAEXT 0x0008 #define ATA_SUPPORT_MICROCODE3 0x0010 #define ATA_SUPPORT_FREEFALL 0x0020 #define ATA_SUPPORT_SENSE_REPORT 0x0040 #define ATA_SUPPORT_EPC 0x0080 /*120*/ u_int16_t enabled2; #define ATA_ENABLED_WRITEREADVERIFY 0x0002 #define ATA_ENABLED_WRITEUNCORREXT 0x0004 #define ATA_ENABLED_FREEFALL 0x0020 #define ATA_ENABLED_SENSE_REPORT 0x0040 #define ATA_ENABLED_EPC 0x0080 u_int16_t reserved121[6]; /*127*/ u_int16_t removable_status; /*128*/ u_int16_t security_status; #define ATA_SECURITY_LEVEL 0x0100 /* 0: high, 1: maximum */ #define ATA_SECURITY_ENH_SUPP 0x0020 /* enhanced erase supported */ #define ATA_SECURITY_COUNT_EXP 0x0010 /* count expired */ #define ATA_SECURITY_FROZEN 0x0008 /* security config is frozen */ #define ATA_SECURITY_LOCKED 0x0004 /* drive is locked */ #define ATA_SECURITY_ENABLED 0x0002 /* ATA Security is enabled */ #define ATA_SECURITY_SUPPORTED 0x0001 /* ATA Security is supported */ u_int16_t reserved129[31]; /*160*/ u_int16_t cfa_powermode1; u_int16_t reserved161; /*162*/ u_int16_t cfa_kms_support; /*163*/ u_int16_t cfa_trueide_modes; /*164*/ u_int16_t cfa_memory_modes; u_int16_t reserved165[4]; /*169*/ u_int16_t support_dsm; #define ATA_SUPPORT_DSM_TRIM 0x0001 u_int16_t reserved170[6]; /*176*/ u_int8_t media_serial[60]; /*206*/ u_int16_t sct; u_int16_t reserved206[2]; /*209*/ u_int16_t lsalign; /*210*/ u_int16_t wrv_sectors_m3_1; u_int16_t wrv_sectors_m3_2; /*212*/ u_int16_t wrv_sectors_m2_1; u_int16_t wrv_sectors_m2_2; /*214*/ u_int16_t nv_cache_caps; /*215*/ u_int16_t nv_cache_size_1; u_int16_t nv_cache_size_2; /*217*/ u_int16_t media_rotation_rate; #define ATA_RATE_NOT_REPORTED 0x0000 #define ATA_RATE_NON_ROTATING 0x0001 u_int16_t reserved218; /*219*/ u_int16_t nv_cache_opt; /*220*/ u_int16_t wrv_mode; u_int16_t reserved221; /*222*/ u_int16_t transport_major; /*223*/ u_int16_t transport_minor; u_int16_t reserved224[31]; /*255*/ u_int16_t integrity; } __packed; /* ATA Dataset Management */ #define ATA_DSM_BLK_SIZE 512 #define ATA_DSM_BLK_RANGES 64 #define ATA_DSM_RANGE_SIZE 8 #define ATA_DSM_RANGE_MAX 65535 /* * ATA Device Register * * bit 7 Obsolete (was 1 in early ATA specs) * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS * bit 5 Obsolete (was 1 in early ATA specs) * bit 4 1 = Slave Drive, 0 = Master Drive * bit 3-0 In LBA mode, 27-24 of address. In CHS mode, head number */ #define ATA_DEV_MASTER 0x00 #define ATA_DEV_SLAVE 0x10 #define ATA_DEV_LBA 0x40 /* ATA limits */ #define ATA_MAX_28BIT_LBA 268435455UL /* ATA Status Register */ #define ATA_STATUS_ERROR 0x01 #define ATA_STATUS_SENSE_AVAIL 0x02 #define ATA_STATUS_ALIGN_ERR 0x04 #define ATA_STATUS_DATA_REQ 0x08 #define ATA_STATUS_DEF_WRITE_ERR 0x10 #define ATA_STATUS_DEVICE_FAULT 0x20 #define ATA_STATUS_DEVICE_READY 0x40 #define ATA_STATUS_BUSY 0x80 /* ATA Error Register */ #define ATA_ERROR_ABORT 0x04 #define ATA_ERROR_ID_NOT_FOUND 0x10 /* ATA HPA Features */ #define ATA_HPA_FEAT_MAX_ADDR 0x00 #define ATA_HPA_FEAT_SET_PWD 0x01 #define ATA_HPA_FEAT_LOCK 0x02 #define ATA_HPA_FEAT_UNLOCK 0x03 #define ATA_HPA_FEAT_FREEZE 0x04 /* ATA transfer modes */ #define ATA_MODE_MASK 0x0f #define ATA_DMA_MASK 0xf0 #define ATA_PIO 0x00 #define ATA_PIO0 0x08 #define ATA_PIO1 0x09 #define ATA_PIO2 0x0a #define ATA_PIO3 0x0b #define ATA_PIO4 0x0c #define ATA_PIO_MAX 0x0f #define ATA_DMA 0x10 #define ATA_WDMA0 0x20 #define ATA_WDMA1 0x21 #define ATA_WDMA2 0x22 #define ATA_UDMA0 0x40 #define ATA_UDMA1 0x41 #define ATA_UDMA2 0x42 #define ATA_UDMA3 0x43 #define ATA_UDMA4 0x44 #define ATA_UDMA5 0x45 #define ATA_UDMA6 0x46 #define ATA_SA150 0x47 #define ATA_SA300 0x48 #define ATA_SA600 0x49 #define ATA_DMA_MAX 0x4f /* ATA commands */ #define ATA_NOP 0x00 /* NOP */ #define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */ #define ATA_NF_AUTOPOLL 0x01 /* start autopoll function */ #define ATA_DATA_SET_MANAGEMENT 0x06 #define ATA_DSM_TRIM 0x01 #define ATA_DEVICE_RESET 0x08 /* reset device */ #define ATA_READ 0x20 /* read */ #define ATA_READ48 0x24 /* read 48bit LBA */ #define ATA_READ_DMA48 0x25 /* read DMA 48bit LBA */ #define ATA_READ_DMA_QUEUED48 0x26 /* read DMA QUEUED 48bit LBA */ #define ATA_READ_NATIVE_MAX_ADDRESS48 0x27 /* read native max addr 48bit */ #define ATA_READ_MUL48 0x29 /* read multi 48bit LBA */ #define ATA_READ_STREAM_DMA48 0x2a /* read DMA stream 48bit LBA */ #define ATA_READ_LOG_EXT 0x2f /* read log ext - PIO Data-In */ #define ATA_READ_STREAM48 0x2b /* read stream 48bit LBA */ #define ATA_WRITE 0x30 /* write */ #define ATA_WRITE48 0x34 /* write 48bit LBA */ #define ATA_WRITE_DMA48 0x35 /* write DMA 48bit LBA */ #define ATA_WRITE_DMA_QUEUED48 0x36 /* write DMA QUEUED 48bit LBA*/ #define ATA_SET_MAX_ADDRESS48 0x37 /* set max address 48bit */ #define ATA_WRITE_MUL48 0x39 /* write multi 48bit LBA */ #define ATA_WRITE_STREAM_DMA48 0x3a #define ATA_WRITE_STREAM48 0x3b #define ATA_WRITE_DMA_FUA48 0x3d #define ATA_WRITE_DMA_QUEUED_FUA48 0x3e #define ATA_WRITE_LOG_EXT 0x3f #define ATA_READ_VERIFY 0x40 #define ATA_READ_VERIFY48 0x42 #define ATA_WRITE_UNCORRECTABLE48 0x45 /* write uncorrectable 48bit LBA */ #define ATA_WU_PSEUDO 0x55 /* pseudo-uncorrectable error */ #define ATA_WU_FLAGGED 0xaa /* flagged-uncorrectable error */ #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ #define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */ #define ATA_ZM_REPORT_ZONES 0x00 /* report zones */ #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ #define ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ #define ATA_ABORT_NCQ_QUEUE 0x00 /* abort NCQ queue */ #define ATA_DEADLINE_HANDLING 0x01 /* deadline handling */ #define ATA_SET_FEATURES 0x05 /* set features */ #define ATA_ZERO_EXT 0x06 /* zero ext */ #define ATA_NCQ_ZAC_MGMT_OUT 0x07 /* NCQ ZAC mgmt out no data */ #define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */ #define ATA_SFPDMA_DSM 0x00 /* Data set management */ #define ATA_SFPDMA_DSM_TRIM 0x01 /* Set trim bit in auxiliary */ #define ATA_SFPDMA_HYBRID_EVICT 0x01 /* Hybrid Evict */ #define ATA_SFPDMA_WLDMA 0x02 /* Write Log DMA EXT */ #define ATA_SFPDMA_ZAC_MGMT_OUT 0x03 /* NCQ ZAC mgmt out w/data */ #define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */ #define ATA_RFPDMA_RL_DMA_EXT 0x00 /* Read Log DMA EXT */ #define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ #define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ #define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ #define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ #define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ #define ATA_ZM_RWP 0x04 /* reset write pointer */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ #define ATA_SMART_CMD 0xb0 /* SMART command */ #define ATA_CFA_ERASE 0xc0 /* CFA erase */ #define ATA_READ_MUL 0xc4 /* read multi */ #define ATA_WRITE_MUL 0xc5 /* write multi */ #define ATA_SET_MULTI 0xc6 /* set multi size */ #define ATA_READ_DMA_QUEUED 0xc7 /* read DMA QUEUED */ #define ATA_READ_DMA 0xc8 /* read DMA */ #define ATA_WRITE_DMA 0xca /* write DMA */ #define ATA_WRITE_DMA_QUEUED 0xcc /* write DMA QUEUED */ #define ATA_WRITE_MUL_FUA48 0xce #define ATA_STANDBY_IMMEDIATE 0xe0 /* standby immediate */ #define ATA_IDLE_IMMEDIATE 0xe1 /* idle immediate */ #define ATA_STANDBY_CMD 0xe2 /* standby */ #define ATA_IDLE_CMD 0xe3 /* idle */ #define ATA_READ_BUFFER 0xe4 /* read buffer */ #define ATA_READ_PM 0xe4 /* read portmultiplier */ #define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ #define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ #define ATA_SF_SETXFER 0x03 /* set transfer mode */ #define ATA_SF_APM 0x05 /* Enable APM feature set */ #define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ #define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ #define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ #define ATA_SF_WRV 0x0b /* Enable Write-Read-Verify */ #define ATA_SF_DLC 0x0c /* Enable device life control */ #define ATA_SF_SATA 0x10 /* Enable use of SATA feature */ #define ATA_SF_FFC 0x41 /* Free-fall Control */ #define ATA_SF_MHIST 0x43 /* Set Max Host Sect. Times */ #define ATA_SF_RATE 0x45 /* Set Rate Basis */ #define ATA_SF_EPC 0x4A /* Extended Power Conditions */ #define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */ #define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */ #define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */ #define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */ #define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */ #define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ #define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/ #define ATA_SF_DSN 0x63 /* Device Stats Notification */ #define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */ #define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ #define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ #define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ #define ATA_SECURITY_ERASE_UNIT 0xf4 /* erase all blocks on drive */ #define ATA_SECURITY_FREEZE_LOCK 0xf5 /* freeze security config */ #define ATA_SECURITY_DISABLE_PASSWORD 0xf6 /* disable drive password */ #define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */ #define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */ /* ATAPI commands */ #define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */ #define ATAPI_REZERO 0x01 /* rewind */ #define ATAPI_REQUEST_SENSE 0x03 /* get sense data */ #define ATAPI_FORMAT 0x04 /* format unit */ #define ATAPI_READ 0x08 /* read data */ #define ATAPI_WRITE 0x0a /* write data */ #define ATAPI_WEOF 0x10 /* write filemark */ #define ATAPI_WF_WRITE 0x01 #define ATAPI_SPACE 0x11 /* space command */ #define ATAPI_SP_FM 0x01 #define ATAPI_SP_EOD 0x03 #define ATAPI_INQUIRY 0x12 /* get inquiry data */ #define ATAPI_MODE_SELECT 0x15 /* mode select */ #define ATAPI_ERASE 0x19 /* erase */ #define ATAPI_MODE_SENSE 0x1a /* mode sense */ #define ATAPI_START_STOP 0x1b /* start/stop unit */ #define ATAPI_SS_LOAD 0x01 #define ATAPI_SS_RETENSION 0x02 #define ATAPI_SS_EJECT 0x04 #define ATAPI_PREVENT_ALLOW 0x1e /* media removal */ #define ATAPI_READ_FORMAT_CAPACITIES 0x23 /* get format capacities */ #define ATAPI_READ_CAPACITY 0x25 /* get volume capacity */ #define ATAPI_READ_BIG 0x28 /* read data */ #define ATAPI_WRITE_BIG 0x2a /* write data */ #define ATAPI_LOCATE 0x2b /* locate to position */ #define ATAPI_READ_POSITION 0x34 /* read position */ #define ATAPI_SYNCHRONIZE_CACHE 0x35 /* flush buf, close channel */ #define ATAPI_WRITE_BUFFER 0x3b /* write device buffer */ #define ATAPI_READ_BUFFER 0x3c /* read device buffer */ #define ATAPI_READ_SUBCHANNEL 0x42 /* get subchannel info */ #define ATAPI_READ_TOC 0x43 /* get table of contents */ #define ATAPI_PLAY_10 0x45 /* play by lba */ #define ATAPI_PLAY_MSF 0x47 /* play by MSF address */ #define ATAPI_PLAY_TRACK 0x48 /* play by track number */ #define ATAPI_PAUSE 0x4b /* pause audio operation */ #define ATAPI_READ_DISK_INFO 0x51 /* get disk info structure */ #define ATAPI_READ_TRACK_INFO 0x52 /* get track info structure */ #define ATAPI_RESERVE_TRACK 0x53 /* reserve track */ #define ATAPI_SEND_OPC_INFO 0x54 /* send OPC structurek */ #define ATAPI_MODE_SELECT_BIG 0x55 /* set device parameters */ #define ATAPI_REPAIR_TRACK 0x58 /* repair track */ #define ATAPI_READ_MASTER_CUE 0x59 /* read master CUE info */ #define ATAPI_MODE_SENSE_BIG 0x5a /* get device parameters */ #define ATAPI_CLOSE_TRACK 0x5b /* close track/session */ #define ATAPI_READ_BUFFER_CAPACITY 0x5c /* get buffer capicity */ #define ATAPI_SEND_CUE_SHEET 0x5d /* send CUE sheet */ #define ATAPI_SERVICE_ACTION_IN 0x96 /* get service data */ #define ATAPI_BLANK 0xa1 /* blank the media */ #define ATAPI_SEND_KEY 0xa3 /* send DVD key structure */ #define ATAPI_REPORT_KEY 0xa4 /* get DVD key structure */ #define ATAPI_PLAY_12 0xa5 /* play by lba */ #define ATAPI_LOAD_UNLOAD 0xa6 /* changer control command */ #define ATAPI_READ_STRUCTURE 0xad /* get DVD structure */ #define ATAPI_PLAY_CD 0xb4 /* universal play command */ #define ATAPI_SET_SPEED 0xbb /* set drive speed */ #define ATAPI_MECH_STATUS 0xbd /* get changer status */ #define ATAPI_READ_CD 0xbe /* read data */ #define ATAPI_POLL_DSC 0xff /* poll DSC status bit */ struct ata_ioc_devices { int channel; char name[2][32]; struct ata_params params[2]; }; /* pr channel ATA ioctl calls */ #define IOCATAGMAXCHANNEL _IOR('a', 1, int) #define IOCATAREINIT _IOW('a', 2, int) #define IOCATAATTACH _IOW('a', 3, int) #define IOCATADETACH _IOW('a', 4, int) #define IOCATADEVICES _IOWR('a', 5, struct ata_ioc_devices) /* ATAPI request sense structure */ struct atapi_sense { u_int8_t error; /* current or deferred errors */ #define ATA_SENSE_VALID 0x80 u_int8_t segment; /* segment number */ u_int8_t key; /* sense key */ #define ATA_SENSE_KEY_MASK 0x0f /* sense key mask */ #define ATA_SENSE_NO_SENSE 0x00 /* no specific sense key info */ #define ATA_SENSE_RECOVERED_ERROR 0x01 /* command OK, data recovered */ #define ATA_SENSE_NOT_READY 0x02 /* no access to drive */ #define ATA_SENSE_MEDIUM_ERROR 0x03 /* non-recovered data error */ #define ATA_SENSE_HARDWARE_ERROR 0x04 /* non-recoverable HW failure */ #define ATA_SENSE_ILLEGAL_REQUEST 0x05 /* invalid command param(s) */ #define ATA_SENSE_UNIT_ATTENTION 0x06 /* media changed */ #define ATA_SENSE_DATA_PROTECT 0x07 /* write protect */ #define ATA_SENSE_BLANK_CHECK 0x08 /* blank check */ #define ATA_SENSE_VENDOR_SPECIFIC 0x09 /* vendor specific skey */ #define ATA_SENSE_COPY_ABORTED 0x0a /* copy aborted */ #define ATA_SENSE_ABORTED_COMMAND 0x0b /* command aborted, try again */ #define ATA_SENSE_EQUAL 0x0c /* equal */ #define ATA_SENSE_VOLUME_OVERFLOW 0x0d /* volume overflow */ #define ATA_SENSE_MISCOMPARE 0x0e /* data dont match the medium */ #define ATA_SENSE_RESERVED 0x0f #define ATA_SENSE_ILI 0x20; #define ATA_SENSE_EOM 0x40; #define ATA_SENSE_FILEMARK 0x80; u_int32_t cmd_info; /* cmd information */ u_int8_t sense_length; /* additional sense len (n-7) */ u_int32_t cmd_specific_info; /* additional cmd spec info */ u_int8_t asc; /* additional sense code */ u_int8_t ascq; /* additional sense code qual */ u_int8_t replaceable_unit_code; /* replaceable unit code */ u_int8_t specific; /* sense key specific */ #define ATA_SENSE_SPEC_VALID 0x80 #define ATA_SENSE_SPEC_MASK 0x7f u_int8_t specific1; /* sense key specific */ u_int8_t specific2; /* sense key specific */ } __packed; /* * SET FEATURES subcommands */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * These values go in the LBA 3:0. */ #define ATA_SF_EPC_RESTORE 0x00 /* Restore Power Condition Settings */ #define ATA_SF_EPC_GOTO 0x01 /* Go To Power Condition */ #define ATA_SF_EPC_SET_TIMER 0x02 /* Set Power Condition Timer */ #define ATA_SF_EPC_SET_STATE 0x03 /* Set Power Condition State */ #define ATA_SF_EPC_ENABLE 0x04 /* Enable the EPC feature set */ #define ATA_SF_EPC_DISABLE 0x05 /* Disable the EPC feature set */ #define ATA_SF_EPC_SET_SOURCE 0x06 /* Set EPC Power Source */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Power Condition ID field * These values go in the count register. */ #define ATA_EPC_STANDBY_Z 0x00 /* Substate of PM2:Standby */ #define ATA_EPC_STANDBY_Y 0x01 /* Substate of PM2:Standby */ #define ATA_EPC_IDLE_A 0x81 /* Substate of PM1:Idle */ #define ATA_EPC_IDLE_B 0x82 /* Substate of PM1:Idle */ #define ATA_EPC_IDLE_C 0x83 /* Substate of PM1:Idle */ #define ATA_EPC_ALL 0xff /* All supported power conditions */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Restore Power Conditions Settings subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_RST_DFLT 0x40 /* 1=Rst from Default, 0= from Saved */ #define ATA_SF_EPC_RST_SAVE 0x10 /* 1=Save on completion */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Got To Power Condition subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_GOTO_DELAY 0x02000000 /* Delayed entry bit */ #define ATA_SF_EPC_GOTO_HOLD 0x01000000 /* Hold Power Cond bit */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set Power Condition Timer subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_TIMER_MASK 0x00ffff00 /* Timer field */ #define ATA_SF_EPC_TIMER_SHIFT 8 #define ATA_SF_EPC_TIMER_SEC 0x00000080 /* Timer units, 1=sec, 0=.1s */ #define ATA_SF_EPC_TIMER_EN 0x00000020 /* Enable/disable cond. */ #define ATA_SF_EPC_TIMER_SAVE 0x00000010 /* Save settings on comp. */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set Power Condition State subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_SETCON_EN 0x00000020 /* Enable power cond. */ #define ATA_SF_EPC_SETCON_SAVE 0x00000010 /* Save settings on comp */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set EPC Power Source subcommand * These values go in the count register. */ #define ATA_SF_EPC_SRC_UNKNOWN 0x0000 /* Unknown source */ #define ATA_SF_EPC_SRC_BAT 0x0001 /* battery source */ #define ATA_SF_EPC_SRC_NOT_BAT 0x0002 /* not battery source */ #define ATA_LOG_DIRECTORY 0x00 /* Directory of all logs */ #define ATA_POWER_COND_LOG 0x08 /* Power Conditions Log */ #define ATA_PCL_IDLE 0x00 /* Idle Power Conditions Page */ #define ATA_PCL_STANDBY 0x01 /* Standby Power Conditions Page */ #define ATA_IDENTIFY_DATA_LOG 0x30 /* Identify Device Data Log */ #define ATA_IDL_PAGE_LIST 0x00 /* List of supported pages */ #define ATA_IDL_IDENTIFY_DATA 0x01 /* Copy of Identify Device data */ #define ATA_IDL_CAPACITY 0x02 /* Capacity */ #define ATA_IDL_SUP_CAP 0x03 /* Supported Capabilities */ #define ATA_IDL_CUR_SETTINGS 0x04 /* Current Settings */ #define ATA_IDL_ATA_STRINGS 0x05 /* ATA Strings */ #define ATA_IDL_SECURITY 0x06 /* Security */ #define ATA_IDL_PARALLEL_ATA 0x07 /* Parallel ATA */ -#define ATA_IDL_SERIAL_ATA 0x08 /* Seiral ATA */ +#define ATA_IDL_SERIAL_ATA 0x08 /* Serial ATA */ #define ATA_IDL_ZDI 0x09 /* Zoned Device Information */ struct ata_gp_log_dir { uint8_t header[2]; #define ATA_GP_LOG_DIR_VERSION 0x0001 uint8_t num_pages[255*2]; /* Number of log pages at address */ }; /* * ATA Power Conditions log descriptor */ struct ata_power_cond_log_desc { uint8_t reserved1; uint8_t flags; #define ATA_PCL_COND_SUPPORTED 0x80 #define ATA_PCL_COND_SAVEABLE 0x40 #define ATA_PCL_COND_CHANGEABLE 0x20 #define ATA_PCL_DEFAULT_TIMER_EN 0x10 #define ATA_PCL_SAVED_TIMER_EN 0x08 #define ATA_PCL_CURRENT_TIMER_EN 0x04 #define ATA_PCL_HOLD_PC_NOT_SUP 0x02 uint8_t reserved2[2]; uint8_t default_timer[4]; uint8_t saved_timer[4]; uint8_t current_timer[4]; uint8_t nom_time_to_active[4]; uint8_t min_timer[4]; uint8_t max_timer[4]; uint8_t num_transitions_to_pc[4]; uint8_t hours_in_pc[4]; uint8_t reserved3[28]; }; /* * ATA Power Conditions Log (0x08), Idle power conditions page (0x00) */ struct ata_power_cond_log_idle { struct ata_power_cond_log_desc idle_a_desc; struct ata_power_cond_log_desc idle_b_desc; struct ata_power_cond_log_desc idle_c_desc; uint8_t reserved[320]; }; /* * ATA Power Conditions Log (0x08), Standby power conditions page (0x01) */ struct ata_power_cond_log_standby { uint8_t reserved[384]; struct ata_power_cond_log_desc standby_y_desc; struct ata_power_cond_log_desc standby_z_desc; }; /* * ATA IDENTIFY DEVICE data log (0x30) page 0x00 * List of Supported IDENTIFY DEVICE data pages. */ struct ata_identify_log_pages { uint8_t header[8]; #define ATA_IDLOG_REVISION 0x0000000000000001 uint8_t entry_count; uint8_t entries[503]; }; /* * ATA IDENTIFY DEVICE data log (0x30) * Capacity (Page 0x02). */ struct ata_identify_log_capacity { uint8_t header[8]; #define ATA_CAP_HEADER_VALID 0x8000000000000000 #define ATA_CAP_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_CAP_PAGE_NUM_SHIFT 16 #define ATA_CAP_REV_MASK 0x00000000000000ff uint8_t capacity[8]; #define ATA_CAP_CAPACITY_VALID 0x8000000000000000 #define ATA_CAP_ACCESSIBLE_CAP 0x0000ffffffffffff uint8_t phys_logical_sect_size[8]; #define ATA_CAP_PL_VALID 0x8000000000000000 #define ATA_CAP_LTOP_REL_SUP 0x4000000000000000 #define ATA_CAP_LOG_SECT_SUP 0x2000000000000000 #define ATA_CAP_ALIGN_ERR_MASK 0x0000000000300000 #define ATA_CAP_LTOP_MASK 0x00000000000f0000 #define ATA_CAP_LOG_SECT_OFF 0x000000000000ffff uint8_t logical_sect_size[8]; #define ATA_CAP_LOG_SECT_VALID 0x8000000000000000 #define ATA_CAP_LOG_SECT_SIZE 0x00000000ffffffff uint8_t nominal_buffer_size[8]; #define ATA_CAP_NOM_BUF_VALID 0x8000000000000000 #define ATA_CAP_NOM_BUF_SIZE 0x7fffffffffffffff uint8_t reserved[472]; }; /* * ATA IDENTIFY DEVICE data log (0x30) * Supported Capabilities (Page 0x03). */ struct ata_identify_log_sup_cap { uint8_t header[8]; #define ATA_SUP_CAP_HEADER_VALID 0x8000000000000000 #define ATA_SUP_CAP_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_SUP_CAP_PAGE_NUM_SHIFT 16 #define ATA_SUP_CAP_REV_MASK 0x00000000000000ff uint8_t sup_cap[8]; #define ATA_SUP_CAP_VALID 0x8000000000000000 #define ATA_SC_SET_SECT_CONFIG_SUP 0x0002000000000000 /* Set Sect Conf*/ #define ATA_SC_ZERO_EXT_SUP 0x0001000000000000 /* Zero EXT */ #define ATA_SC_SUCC_NCQ_SENSE_SUP 0x0000800000000000 /* Succ. NCQ Sns */ #define ATA_SC_DLC_SUP 0x0000400000000000 /* DLC */ #define ATA_SC_RQSN_DEV_FAULT_SUP 0x0000200000000000 /* Req Sns Dev Flt*/ #define ATA_SC_DSN_SUP 0x0000100000000000 /* DSN */ #define ATA_SC_LP_STANDBY_SUP 0x0000080000000000 /* LP Standby */ #define ATA_SC_SET_EPC_PS_SUP 0x0000040000000000 /* Set EPC PS */ #define ATA_SC_AMAX_ADDR_SUP 0x0000020000000000 /* AMAX Addr */ #define ATA_SC_DRAT_SUP 0x0000008000000000 /* DRAT */ #define ATA_SC_LPS_MISALGN_SUP 0x0000004000000000 /* LPS Misalign */ #define ATA_SC_RB_DMA_SUP 0x0000001000000000 /* Read Buf DMA */ #define ATA_SC_WB_DMA_SUP 0x0000000800000000 /* Write Buf DMA */ #define ATA_SC_DNLD_MC_DMA_SUP 0x0000000200000000 /* DL MCode DMA */ #define ATA_SC_28BIT_SUP 0x0000000100000000 /* 28-bit */ #define ATA_SC_RZAT_SUP 0x0000000080000000 /* RZAT */ #define ATA_SC_NOP_SUP 0x0000000020000000 /* NOP */ #define ATA_SC_READ_BUFFER_SUP 0x0000000010000000 /* Read Buffer */ #define ATA_SC_WRITE_BUFFER_SUP 0x0000000008000000 /* Write Buffer */ #define ATA_SC_READ_LOOK_AHEAD_SUP 0x0000000002000000 /* Read Look-Ahead*/ #define ATA_SC_VOLATILE_WC_SUP 0x0000000001000000 /* Volatile WC */ #define ATA_SC_SMART_SUP 0x0000000000800000 /* SMART */ #define ATA_SC_FLUSH_CACHE_EXT_SUP 0x0000000000400000 /* Flush Cache Ext */ #define ATA_SC_48BIT_SUP 0x0000000000100000 /* 48-Bit */ #define ATA_SC_SPINUP_SUP 0x0000000000040000 /* Spin-Up */ #define ATA_SC_PUIS_SUP 0x0000000000020000 /* PUIS */ #define ATA_SC_APM_SUP 0x0000000000010000 /* APM */ #define ATA_SC_DL_MICROCODE_SUP 0x0000000000004000 /* DL Microcode */ #define ATA_SC_UNLOAD_SUP 0x0000000000002000 /* Unload */ #define ATA_SC_WRITE_FUA_EXT_SUP 0x0000000000001000 /* Write FUA EXT */ #define ATA_SC_GPL_SUP 0x0000000000000800 /* GPL */ #define ATA_SC_STREAMING_SUP 0x0000000000000400 /* Streaming */ #define ATA_SC_SMART_SELFTEST_SUP 0x0000000000000100 /* SMART self-test */ #define ATA_SC_SMART_ERR_LOG_SUP 0x0000000000000080 /* SMART Err Log */ #define ATA_SC_EPC_SUP 0x0000000000000040 /* EPC */ #define ATA_SC_SENSE_SUP 0x0000000000000020 /* Sense data */ #define ATA_SC_FREEFALL_SUP 0x0000000000000010 /* Free-Fall */ #define ATA_SC_DM_MODE3_SUP 0x0000000000000008 /* DM Mode 3 */ #define ATA_SC_GPL_DMA_SUP 0x0000000000000004 /* GPL DMA */ #define ATA_SC_WRITE_UNCOR_SUP 0x0000000000000002 /* Write uncorr. */ #define ATA_SC_WRV_SUP 0x0000000000000001 /* WRV */ uint8_t download_code_cap[8]; #define ATA_DL_CODE_VALID 0x8000000000000000 #define ATA_DLC_DM_OFFSETS_DEFER_SUP 0x0000000400000000 #define ATA_DLC_DM_IMMED_SUP 0x0000000200000000 #define ATA_DLC_DM_OFF_IMMED_SUP 0x0000000100000000 #define ATA_DLC_DM_MAX_XFER_SIZE_MASK 0x00000000ffff0000 #define ATA_DLC_DM_MAX_XFER_SIZE_SHIFT 16 #define ATA_DLC_DM_MIN_XFER_SIZE_MASK 0x000000000000ffff uint8_t nom_media_rotation_rate[8]; #define ATA_NOM_MEDIA_ROTATION_VALID 0x8000000000000000 #define ATA_ROTATION_MASK 0x000000000000ffff uint8_t form_factor[8]; #define ATA_FORM_FACTOR_VALID 0x8000000000000000 #define ATA_FF_MASK 0x000000000000000f #define ATA_FF_NOT_REPORTED 0x0000000000000000 /* Not reported */ #define ATA_FF_525_IN 0x0000000000000001 /* 5.25 inch */ #define ATA_FF_35_IN 0x0000000000000002 /* 3.5 inch */ #define ATA_FF_25_IN 0x0000000000000003 /* 2.5 inch */ #define ATA_FF_18_IN 0x0000000000000004 /* 1.8 inch */ #define ATA_FF_LT_18_IN 0x0000000000000005 /* < 1.8 inch */ #define ATA_FF_MSATA 0x0000000000000006 /* mSATA */ #define ATA_FF_M2 0x0000000000000007 /* M.2 */ #define ATA_FF_MICROSSD 0x0000000000000008 /* MicroSSD */ #define ATA_FF_CFAST 0x0000000000000009 /* CFast */ uint8_t wrv_sec_cnt_mode3[8]; #define ATA_WRV_MODE3_VALID 0x8000000000000000 #define ATA_WRV_MODE3_COUNT 0x00000000ffffffff uint8_t wrv_sec_cnt_mode2[8]; #define ATA_WRV_MODE2_VALID 0x8000000000000000 #define ATA_WRV_MODE2_COUNT 0x00000000ffffffff uint8_t wwn[16]; /* XXX KDM need to figure out how to handle 128-bit fields */ uint8_t dsm[8]; #define ATA_DSM_VALID 0x8000000000000000 #define ATA_LB_MARKUP_SUP 0x000000000000ff00 #define ATA_TRIM_SUP 0x0000000000000001 uint8_t util_per_unit_time[16]; /* XXX KDM need to figure out how to handle 128-bit fields */ uint8_t util_usage_rate_sup[8]; #define ATA_UTIL_USAGE_RATE_VALID 0x8000000000000000 #define ATA_SETTING_RATE_SUP 0x0000000000800000 #define ATA_SINCE_POWERON_SUP 0x0000000000000100 #define ATA_POH_RATE_SUP 0x0000000000000010 #define ATA_DATE_TIME_RATE_SUP 0x0000000000000001 uint8_t zoned_cap[8]; #define ATA_ZONED_VALID 0x8000000000000000 #define ATA_ZONED_MASK 0x0000000000000003 uint8_t sup_zac_cap[8]; #define ATA_SUP_ZAC_CAP_VALID 0x8000000000000000 #define ATA_ND_RWP_SUP 0x0000000000000010 /* Reset Write Ptr*/ #define ATA_ND_FINISH_ZONE_SUP 0x0000000000000008 /* Finish Zone */ #define ATA_ND_CLOSE_ZONE_SUP 0x0000000000000004 /* Close Zone */ #define ATA_ND_OPEN_ZONE_SUP 0x0000000000000002 /* Open Zone */ #define ATA_REPORT_ZONES_SUP 0x0000000000000001 /* Report Zones */ uint8_t reserved[392]; }; /* * ATA Identify Device Data Log Zoned Device Information Page (0x09). * Current as of ZAC r04a, August 25, 2015. */ struct ata_zoned_info_log { uint8_t header[8]; #define ATA_ZDI_HEADER_VALID 0x8000000000000000 #define ATA_ZDI_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_ZDI_PAGE_NUM_SHIFT 16 #define ATA_ZDI_REV_MASK 0x00000000000000ff uint8_t zoned_cap[8]; #define ATA_ZDI_CAP_VALID 0x8000000000000000 #define ATA_ZDI_CAP_URSWRZ 0x0000000000000001 uint8_t zoned_settings[8]; #define ATA_ZDI_SETTINGS_VALID 0x8000000000000000 uint8_t optimal_seq_zones[8]; #define ATA_ZDI_OPT_SEQ_VALID 0x8000000000000000 #define ATA_ZDI_OPT_SEQ_MASK 0x00000000ffffffff uint8_t optimal_nonseq_zones[8]; #define ATA_ZDI_OPT_NS_VALID 0x8000000000000000 #define ATA_ZDI_OPT_NS_MASK 0x00000000ffffffff uint8_t max_seq_req_zones[8]; #define ATA_ZDI_MAX_SEQ_VALID 0x8000000000000000 #define ATA_ZDI_MAX_SEQ_MASK 0x00000000ffffffff uint8_t version_info[8]; #define ATA_ZDI_VER_VALID 0x8000000000000000 #define ATA_ZDI_VER_ZAC_SUP 0x0100000000000000 #define ATA_ZDI_VER_ZAC_MASK 0x00000000000000ff uint8_t reserved[456]; }; struct ata_ioc_request { union { struct { u_int8_t command; u_int8_t feature; u_int64_t lba; u_int16_t count; } ata; struct { char ccb[16]; struct atapi_sense sense; } atapi; } u; caddr_t data; int count; int flags; #define ATA_CMD_CONTROL 0x01 #define ATA_CMD_READ 0x02 #define ATA_CMD_WRITE 0x04 #define ATA_CMD_ATAPI 0x08 int timeout; int error; }; struct ata_security_password { u_int16_t ctrl; #define ATA_SECURITY_PASSWORD_USER 0x0000 #define ATA_SECURITY_PASSWORD_MASTER 0x0001 #define ATA_SECURITY_ERASE_NORMAL 0x0000 #define ATA_SECURITY_ERASE_ENHANCED 0x0002 #define ATA_SECURITY_LEVEL_HIGH 0x0000 #define ATA_SECURITY_LEVEL_MAXIMUM 0x0100 u_int8_t password[32]; u_int16_t revision; u_int16_t reserved[238]; }; /* pr device ATA ioctl calls */ #define IOCATAREQUEST _IOWR('a', 100, struct ata_ioc_request) #define IOCATAGPARM _IOR('a', 101, struct ata_params) #define IOCATAGMODE _IOR('a', 102, int) #define IOCATASMODE _IOW('a', 103, int) #define IOCATAGSPINDOWN _IOR('a', 104, int) #define IOCATASSPINDOWN _IOW('a', 105, int) struct ata_ioc_raid_config { int lun; int type; #define AR_JBOD 0x0001 #define AR_SPAN 0x0002 #define AR_RAID0 0x0004 #define AR_RAID1 0x0008 #define AR_RAID01 0x0010 #define AR_RAID3 0x0020 #define AR_RAID4 0x0040 #define AR_RAID5 0x0080 int interleave; int status; #define AR_READY 1 #define AR_DEGRADED 2 #define AR_REBUILDING 4 int progress; int total_disks; int disks[16]; }; struct ata_ioc_raid_status { int lun; int type; int interleave; int status; int progress; int total_disks; struct { int state; #define AR_DISK_ONLINE 0x01 #define AR_DISK_PRESENT 0x02 #define AR_DISK_SPARE 0x04 int lun; } disks[16]; }; /* ATA RAID ioctl calls */ #define IOCATARAIDCREATE _IOWR('a', 200, struct ata_ioc_raid_config) #define IOCATARAIDDELETE _IOW('a', 201, int) #define IOCATARAIDSTATUS _IOWR('a', 202, struct ata_ioc_raid_status) #define IOCATARAIDADDSPARE _IOW('a', 203, struct ata_ioc_raid_config) #define IOCATARAIDREBUILD _IOW('a', 204, int) #endif /* _SYS_ATA_H_ */ Index: projects/netbsd-tests-upstream-01-2017/sys/vm/vm_object.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/vm/vm_object.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/vm/vm_object.c (revision 312218) @@ -1,2604 +1,2615 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; object->rtree.rt_root = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif rw_init(&kmem_object->lock, "kmem vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_init(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vn_printf(vp, "vm_object_vndeallocate "); panic("vm_object_vndeallocate: bad object reference count"); } #endif if (!umtx_shm_vnobj_persistent && object->ref_count == 1) umtx_shm_object_terminated(object); /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from * object->handle. */ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } else { vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vdrop(vp); VM_OBJECT_WLOCK(object); object->ref_count--; if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { if (object->ref_count == 0) VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } } } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; struct vnode *vp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD || object->ref_count != 1) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vdrop(vp); return; } if ((object->flags & OBJ_TMPFS) != 0) VOP_UNSET_TEXT(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_WUNLOCK(robject); object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PDROP | PVM, "objde2", 0); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); vinvalbuf(vp, V_SAVE, 0, 0); BO_LOCK(&vp->v_bufobj); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); VM_OBJECT_WLOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); vm_page_lock(p); /* * Optimize the page's removal from the object by resetting * its "object" field. Specifically, if the page is not * wired, then the effect of this assignment is that * vm_page_free()'s call to vm_page_remove() will return * immediately without modifying the page or the object. */ p->object = NULL; if (p->wire_count == 0) { vm_page_free(p); PCPU_INC(cnt.v_pfree); } vm_page_unlock(p); } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && OFF_TO_IDX(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, - int advise) + int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; + VM_OBJECT_WLOCK(object); - /* - * Locate and adjust resident pages - */ - for (; pindex < end; pindex += 1) { + for (m = NULL; pindex < end; pindex++) { relookup: tobject = object; tpindex = pindex; shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ - if (advise == MADV_FREE) { + if (advice == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } else if ((tobject->flags & OBJ_UNMANAGED) != 0) goto unlock_tobject; - m = vm_page_lookup(tobject, tpindex); - if (m == NULL) { - /* - * There may be swap even if there is no backing page - */ - if (advise == MADV_FREE && tobject->type == OBJT_SWAP) + + /* + * In the common case where the object has no backing object, we + * can avoid performing lookups at each pindex. In either case, + * when applying MADV_FREE we take care to release any swap + * space used to store non-resident pages. + */ + if (object->backing_object == NULL) { + m = (m != NULL) ? TAILQ_NEXT(m, listq) : + vm_page_find_least(object, pindex); + tpindex = (m != NULL && m->pindex < end) ? + m->pindex : end; + if (advice == MADV_FREE && object->type == OBJT_SWAP && + tpindex > pindex) + swap_pager_freespace(object, pindex, + tpindex - pindex); + if ((pindex = tpindex) == end) + break; + } else if ((m = vm_page_lookup(tobject, tpindex)) == NULL) { + if (advice == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* - * next object + * Prepare to search the next object in the chain. */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; goto shadowlookup; - } else if (m->valid != VM_PAGE_BITS_ALL) - goto unlock_tobject; + } + /* * If the page is not in a normal state, skip it. */ + if (m->valid != VM_PAGE_BITS_ALL) + goto unlock_tobject; vm_page_lock(m); if (m->hold_count != 0 || m->wire_count != 0) { vm_page_unlock(m); goto unlock_tobject; } KASSERT((m->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", m)); if (vm_page_busied(m)) { - if (advise == MADV_WILLNEED) { + if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); } if (object != tobject) VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(tobject); vm_page_busy_sleep(m, "madvpo", false); + m = NULL; VM_OBJECT_WLOCK(object); goto relookup; } - if (advise == MADV_WILLNEED) { - vm_page_activate(m); - } else { - vm_page_advise(m, advise); - } + vm_page_advise(m, advice); vm_page_unlock(m); - if (advise == MADV_FREE && tobject->type == OBJT_SWAP) + if (advice == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); - } + } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_lock(m); VM_OBJECT_WUNLOCK(orig_object); vm_page_busy_sleep(m, "spltwt", false); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); VM_WAIT; VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); if (p != NULL) vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); if (p == NULL) VM_WAIT; else vm_page_busy_sleep(p, "vmocol", false); VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); /* * Page is out of the parent object's range, we can * simply destroy it. */ vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if (object->paging_in_progress != 0 || backing_object->paging_in_progress != 0) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { vm_object_pip_add(object, 1); vm_object_pip_add(backing_object, 1); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); vm_object_pip_wakeup(object); object_collapses++; } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ vm_page_lock(p); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax", true); VM_OBJECT_WLOCK(object); goto again; } if (p->wire_count != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } goto next; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopar", false); VM_OBJECT_WLOCK(object); goto again; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_write(p); if (p->dirty) goto next; } if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); vm_page_free(p); next: vm_page_unlock(p); } vm_object_pip_wakeup(object); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(p); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); break; } } /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || (prev_object->flags & OBJ_TMPFS_NODE) != 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (depth = 0; depth < locked_depth; depth++) { tobject = object->backing_object; VM_OBJECT_RUNLOCK(object); object = tobject; } } struct vnode * vm_object_vnode(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) return (object->handle); if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) return (object->un_pager.swp.swp_tmpfs); return (NULL); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo.kvo_size = ptoa(obj->size); kvo.kvo_resident = obj->resident_page_count; kvo.kvo_ref_count = obj->ref_count; kvo.kvo_shadow_count = obj->shadow_count; kvo.kvo_memattr = obj->memattr; kvo.kvo_active = 0; kvo.kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (vm_page_active(m)) kvo.kvo_active++; else if (vm_page_inactive(m)) kvo.kvo_inactive++; } kvo.kvo_vn_fileid = 0; kvo.kvo_vn_fsid = 0; freepath = NULL; fullpath = ""; vp = NULL; switch (obj->type) { case OBJT_DEFAULT: kvo.kvo_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kvo.kvo_type = KVME_TYPE_VNODE; vp = obj->handle; vref(vp); break; case OBJT_SWAP: kvo.kvo_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kvo.kvo_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kvo.kvo_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kvo.kvo_type = KVME_TYPE_DEAD; break; case OBJT_SG: kvo.kvo_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kvo.kvo_type = KVME_TYPE_MGTDEVICE; break; default: kvo.kvo_type = KVME_TYPE_UNKNOWN; break; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo.kvo_vn_fileid = va.va_fileid; kvo.kvo_vn_fsid = va.va_fsid; } vput(vp); } strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo.kvo_path) + 1; kvo.kvo_structsize = roundup(kvo.kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: projects/netbsd-tests-upstream-01-2017/sys/vm/vm_page.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/sys/vm/vm_page.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/sys/vm/vm_page.c (revision 312218) @@ -1,3600 +1,3603 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. * * * The page daemon can acquire and hold any pair of page queue * locks in any order. * * - The object lock is required when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). * */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; static int boot_pages = UMA_BOOT_PAGES; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); /* Is the page daemon waiting for free pages? */ static int vm_pageout_pages_needed; static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_free_wakeup(void); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; vm_page_t m; char *next; int ret; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) continue; mtx_lock(&vm_page_queue_free_mtx); ret = vm_phys_unfree_page(m); mtx_unlock(&vm_page_queue_free_mtx); if (ret == TRUE) { TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (bootverbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; int i; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = &vm_cnt.v_laundry_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; /* Unswappable dirty pages are counted as being in the laundry. */ *__DECONST(int **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_vcnt) = &vm_cnt.v_laundry_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_paddr_t page_range; vm_paddr_t new_end; int i; vm_paddr_t pa; vm_paddr_t last_pa; char *list, *listend; vm_paddr_t end; vm_paddr_t biggestsize; vm_paddr_t low_water, high_water; int biggestone; int pages_per_zone; biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } low_water = phys_avail[0]; high_water = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_water) low_water = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_water) high_water = vm_phys_segs[i].end; } for (i = 0; phys_avail[i + 1]; i += 2) { vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } if (phys_avail[i] < low_water) low_water = phys_avail[i]; if (phys_avail[i + 1] > high_water) high_water = phys_avail[i + 1]; } end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(&vm_dom[i]); /* * Almost all of the pages needed for boot strapping UMA are used * for zone structures, so if the number of CPUs results in those * structures taking more than one page each, we set aside more pages * in proportion to the zone structure size. */ pages_per_zone = howmany(sizeof(struct uma_zone) + sizeof(struct uma_cache) * (mp_maxid + 1), UMA_SLAB_SIZE); if (pages_per_zone > 1) { /* Reserve more pages so that we don't run out. */ boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone; } /* * Allocate memory for use when boot strapping the kernel memory * allocator. * * CTFLAG_RDTUN doesn't work during the early boot process, so we must * manually fetch the value. */ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ defined(__i386__) || defined(__mips__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #endif #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = low_water / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE page_range = 0; for (i = 0; i < vm_phys_nsegs; i++) { page_range += atop(vm_phys_segs[i].end - vm_phys_segs[i].start); } for (i = 0; phys_avail[i + 1] != 0; i += 2) page_range += atop(phys_avail[i + 1] - phys_avail[i]); #elif defined(VM_PHYSSEG_DENSE) page_range = high_water / PAGE_SIZE - first_page; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. */ vaddr += PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; #if VM_NRESERVLEVEL > 0 /* * Allocate memory for the reservation management system's data * structures. */ new_end = vm_reserv_startup(&vaddr, new_end, high_water); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * pmap_map on arm64, amd64, and mips can come out of the direct-map, * not kvm like i386, so the pages must be tracked for a crashdump to * include this data. This includes the vm_page_array and the early * UMA bootstrap pages. */ for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); for (i = 0; i < page_range; i++) vm_page_array[i].order = VM_NFREEORDER; vm_page_array_size = page_range; /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Add every available physical page that is not blacklisted to * the free lists. */ vm_cnt.v_page_count = 0; vm_cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa) { vm_phys_add_page(pa); pa += PAGE_SIZE; } } TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; bool locked; vm_page_assert_xbusied(m); locked = mtx_owned(vm_page_lockptr(m)); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; if (x != 0 && !locked) vm_page_lock(m); if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) break; if (x != 0 && !locked) vm_page_unlock(m); } if (x != 0) { wakeup(m); if (!locked) vm_page_unlock(m); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); for (;;) { x = m->busy_lock; if (VPB_SHARERS(x) > 1) { if (atomic_cmpset_int(&m->busy_lock, x, x - VPB_ONE_SHARER)) break; continue; } if ((x & VPB_BIT_WAITERS) == 0) { KASSERT(x == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (atomic_cmpset_int(&m->busy_lock, VPB_SHARERS_WORD(1), VPB_UNBUSIED)) break; continue; } KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), ("vm_page_sunbusy: invalid lock state for waiters")); vm_page_lock(m); if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { vm_page_unlock(m); continue; } wakeup(m); vm_page_unlock(m); break; } } /* * vm_page_busy_sleep: * * Sleep and release the page lock, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * The given page must be locked. * * If nonshared is true, sleep only if the page is xbusy. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { u_int x; vm_page_assert_locked(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { vm_page_unlock(m); return; } msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; for (;;) { x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) return (1); } } static void vm_page_xunbusy_locked(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_assert_locked(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* There is a waiter, do wakeup() instead of vm_page_flash(). */ wakeup(m); } void vm_page_xunbusy_maybelocked(vm_page_t m) { bool lockacq; vm_page_assert_xbusied(m); /* * Fast path for unbusy. If it succeeds, we know that there * are no waiters, so we do not need a wakeup. */ if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) return; lockacq = !mtx_owned(vm_page_lockptr(m)); if (lockacq) vm_page_lock(m); vm_page_xunbusy_locked(m); if (lockacq) vm_page_unlock(m); } /* * vm_page_xunbusy_hard: * * Called after the first try the exclusive unbusy of a page failed. * It is assumed that the waiters bit is on. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_lock(m); vm_page_xunbusy_locked(m); vm_page_unlock(m); } /* * vm_page_flash: * * Wakeup anyone waiting for the page. * The ownership bits do not change. * * The given page must be locked. */ void vm_page_flash(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_OWNED); for (;;) { x = m->busy_lock; if ((x & VPB_BIT_WAITERS) == 0) return; if (atomic_cmpset_int(&m->busy_lock, x, x & (~VPB_BIT_WAITERS))) break; } wakeup(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); --mem->hold_count; if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) vm_page_free_toq(mem); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { struct mtx *mtx, *new_mtx; mtx = NULL; for (; count != 0; count--) { /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(*ma); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_unhold(*ma); ma++; } if (mtx != NULL) mtx_unlock(mtx); } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; m->wire_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((object = m->object) == NULL) return; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) vm_page_xunbusy_maybelocked(m); mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL, ("vm_page_replace: page already in object")); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: mold is on a paging queue")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy_maybelocked(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { vm_page_t m, mpred; int flags, req_class; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc: pindex already allocated")); } /* * Allocate a page if the number of free pages exceeds the minimum * for the request class. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count > 0)) { /* * Can we allocate the page from a reservation? */ #if VM_NRESERVLEVEL > 0 if (object == NULL || (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) != OBJ_COLORED || (m = vm_reserv_alloc_page(object, pindex, mpred)) == NULL) #endif { /* * If not, allocate it from the free page queues. */ m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 if (m == NULL && vm_reserv_reclaim_inactive()) { m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); } #endif } } else { /* * Not allocatable, give up. */ mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. */ KASSERT(m != NULL, ("vm_page_alloc: missing page")); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; flags &= m->flags; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(); if (req & VM_ALLOC_WIRED) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; int req_class; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count >= npages)) { /* * Can we allocate the pages from a reservation? */ #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || (m_ret = vm_reserv_alloc_contig(object, pindex, npages, low, high, alignment, boundary, mpred)) == NULL) #endif /* * If not, allocate them from the free page queues. */ m_ret = vm_phys_alloc_contig(npages, low, high, alignment, boundary); } else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, npages); pagedaemon_wakeup(); return (NULL); } if (m_ret != NULL) vm_phys_freecnt_adj(m_ret, -npages); else { #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(npages, low, high, alignment, boundary)) goto retry; #endif } mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); for (m = m_ret; m < &m_ret[npages]; m++) vm_page_alloc_check(m); /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; m->act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0) atomic_subtract_int( &vm_cnt.v_wire_count, npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } if (vm_paging_needed()) pagedaemon_wakeup(); return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE, ("page %p has unexpected queue %d", m, m->queue)); KASSERT(m->wire_count == 0, ("page %p is wired", m)); KASSERT(m->hold_count == 0, ("page %p is held", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_freelist(int flind, int req) { vm_page_t m; u_int flags; int req_class; req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Do not allocate reserved pages unless the req has asked for it. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count > 0)) m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } if (m == NULL) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { struct mtx *m_mtx, *new_mtx; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; m_mtx = NULL; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } m_inc = 1; retry: if (m->wire_count != 0 || m->hold_count != 0) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = m->object) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ if (!VM_OBJECT_TRYRLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_RLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_RUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { run_ext = 0; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && m->queue != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ run_ext = 1; } else run_ext = 0; unlock: VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (m_mtx != NULL) mtx_unlock(m_mtx); if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct mtx *m_mtx, *new_mtx; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; m_mtx = NULL; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } retry: if (m->wire_count != 0 || m->hold_count != 0) error = EBUSY; else if ((object = m->object) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_WLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_WUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { error = EBUSY; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (m->queue != PQ_NONE && !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ if (m->valid != 0) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { error = ENOMEM; goto unlock; } KASSERT(m_new->wire_count == 0, ("page %p is wired", m)); /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ if (object->ref_count != 0) pmap_remove_all(m); m_new->aflags = m->aflags; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m)); m_new->oflags = m->oflags & VPO_NOSYNC; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_xbusy(m); vm_page_remque(m); vm_page_replace_checked(m_new, object, m->pindex, m); m->valid = 0; vm_page_undirty(m); /* * The new page must be deactivated * before the object is unlocked. */ new_mtx = vm_page_lockptr(m_new); if (m_mtx != new_mtx) { mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_remque(m); vm_page_remove(m); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } SLIST_INSERT_HEAD(&free, m, plinks.s.ss); } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { mtx_lock(&vm_page_queue_free_mtx); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif mtx_unlock(&vm_page_queue_free_mtx); if (order == VM_NFREEORDER) error = EINVAL; } } if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { mtx_lock(&vm_page_queue_free_mtx); do { SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (true) #endif vm_phys_free_pages(m, 0); } while ((m = SLIST_FIRST(&free)) != NULL); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform VM_WAIT before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ count = vm_cnt.v_free_count; if (count < npages + vm_cnt.v_free_reserved || (count < npages + vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } /* * vm_wait: (also see VM_WAIT macro) * * Sleep until free pages are available for allocation. * - Called in various places before memory allocations. */ void vm_wait(void) { mtx_lock(&vm_page_queue_free_mtx); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (__predict_false(pageproc == NULL)) panic("vm_wait in early boot"); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { mtx_lock(&vm_page_queue_free_mtx); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, "pfault", 0); } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { if (vm_page_in_laundry(m)) return (&vm_dom[0].vmd_pagequeues[m->queue]); else return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* * vm_page_dequeue: * * Remove the given page from its current page queue. * * The page must be locked. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_assert_locked(m); KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } /* * vm_page_dequeue_locked: * * Remove the given page from its current page queue. * * The page and page queue must be locked. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } /* * vm_page_enqueue: * * Add the given page to the specified page queue. * * The page must be locked. */ static void vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); if (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE) pq = &vm_dom[0].vmd_pagequeues[queue]; else pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } /* * vm_page_requeue: * * Move the given page to the tail of its current page queue. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_unlock(pq); } /* * vm_page_requeue_locked: * * Move the given page to the tail of its current page queue. * * The page queue must be locked. */ void vm_page_requeue_locked(vm_page_t m) { struct vm_pagequeue *pq; KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { int queue; vm_page_lock_assert(m, MA_OWNED); if ((queue = m->queue) != PQ_ACTIVE) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_ACTIVE, m); } else KASSERT(queue == PQ_NONE, ("vm_page_activate: wired page %p is queued", m)); } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq(). This routine is called * when a page is added to the free queues. * * The page queues must be locked. */ static inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = false; wakeup(&vm_cnt.v_free_count); } } /* * vm_page_free_toq: * * Returns the given page to the free list, * disassociating it with any VM object. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_free_toq(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); } else KASSERT(m->queue == PQ_NONE, ("vm_page_free_toq: unmanaged page %p is queued", m)); PCPU_INC(cnt.v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); /* * Unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_page_remque(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) panic("vm_page_free: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; } else { /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* * Insert the page into the physical memory allocator's free * page queues. */ mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (TRUE) #endif vm_phys_free_pages(m, 0); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * If the page is fictitious, then its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count == 0) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially allowing it to be * paged out. Returns TRUE if the number of wirings transitions to zero and * FALSE otherwise. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue (unless PQ_NONE is * specified). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ boolean_t vm_page_unwire(vm_page_t m, uint8_t queue) { KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return (FALSE); } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL && queue != PQ_NONE) vm_page_enqueue(queue, m); return (TRUE); } else return (FALSE); } else panic("vm_page_unwire: page %p's wire count is zero", m); } /* * Move the specified page to the inactive queue. * * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive * queue. However, setting "noreuse" to TRUE will accelerate the specified * page's reclamation, but it will not unmap the page from any address space. * This is implemented by inserting the page near the head of the inactive * queue, using a marker page to guide FIFO insertion ordering. * * The page must be locked. */ static inline void _vm_page_deactivate(vm_page_t m, boolean_t noreuse) { struct vm_pagequeue *pq; int queue; vm_page_assert_locked(m); /* * Ignore if the page is already inactive, unless it is unlikely to be * reactivated. */ if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; /* Avoid multiple acquisitions of the inactive queue lock. */ if (queue == PQ_INACTIVE) { vm_pagequeue_lock(pq); vm_page_dequeue_locked(m); } else { if (queue != PQ_NONE) vm_page_dequeue(m); vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; if (noreuse) TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead, m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } /* * Move the specified page to the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, FALSE); } /* * Move the specified page to the inactive queue with the expectation * that it is unlikely to be reused. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { _vm_page_deactivate(m, TRUE); } /* * vm_page_launder * * Put a page in the laundry. */ void vm_page_launder(vm_page_t m) { int queue; vm_page_assert_locked(m); if ((queue = m->queue) != PQ_LAUNDRY) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_LAUNDRY, m); } else KASSERT(queue == PQ_NONE, ("wired page %p is queued", m)); } } /* * vm_page_unswappable * * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { vm_page_assert_locked(m); KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); if (m->queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_UNSWAPPABLE, m); } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); if (m->object != NULL) VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_free(m); return (1); } /* * vm_page_advise * - * Deactivate or do nothing, as appropriate. + * Apply the specified advice to the given page. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); - else if (advice != MADV_DONTNEED) + else if (advice != MADV_DONTNEED) { + if (advice == MADV_WILLNEED) + vm_page_activate(m); return; + } /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else vm_page_launder(m); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, allocflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * vm_page_ps_is_valid: * * Returns TRUE if the entire (super)page is valid and FALSE otherwise. */ boolean_t vm_page_ps_is_valid(vm_page_t m) { int i, npages; VM_OBJECT_ASSERT_LOCKED(m->object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { if (m[i].valid != VM_PAGE_BITS_ALL) return (FALSE); } return (TRUE); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count); db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_cnt.v_free_count); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags, m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: projects/netbsd-tests-upstream-01-2017/tests/sys/kern/acct/Makefile =================================================================== --- projects/netbsd-tests-upstream-01-2017/tests/sys/kern/acct/Makefile (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/tests/sys/kern/acct/Makefile (revision 312218) @@ -1,19 +1,20 @@ # $FreeBSD$ TESTSDIR= ${TESTSBASE}/sys/kern/acct ATF_TESTS_C= acct_test CFLAGS+= -I${.OBJDIR} CLEANFILES+= convert.c convert.c.tmp DPSRCS.acct_test= convert.c acct_test.o: convert.c convert.c: ${SRCTOP}/sys/kern/kern_acct.c sed -n -e 's/log(/syslog(/g' \ + -e 's/exp/expected/g' \ -e '/FLOAT_CONVERSION_START/,/FLOAT_CONVERSION_END/p' ${.ALLSRC} >${.TARGET}.tmp mv ${.TARGET}.tmp ${.TARGET} .include Index: projects/netbsd-tests-upstream-01-2017/tests/sys/mac/bsdextended/ugidfw_test.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/tests/sys/mac/bsdextended/ugidfw_test.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/tests/sys/mac/bsdextended/ugidfw_test.c (revision 312218) @@ -1,253 +1,253 @@ /*- * Copyright (c) 2005 McAfee, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Starting point for a regression test for mac_bsdextended(4) and the * supporting libugidfw(3). */ /* * This section of the regression test passes some test cases through the * rule<->string routines to confirm they work approximately as desired. */ /* * List of users and groups we must check exists before we can begin, since * they are used in the string test rules. We use users and groups that will * always exist in a default install used for regression testing. */ static const char *test_users[] = { "root", "daemon", "operator", "bin", }; static const char *test_groups[] = { "wheel", "daemon", "operator", "bin", }; static int test_num; /* * List of test strings that must go in (and come out) of libugidfw intact. */ static const char *test_strings[] = { /* Variations on subject and object uids. */ "subject uid root object uid root mode n", "subject uid root object uid daemon mode n", "subject uid daemon object uid root mode n", "subject uid daemon object uid daemon mode n", /* Variations on mode. */ "subject uid root object uid root mode a", "subject uid root object uid root mode r", "subject uid root object uid root mode s", "subject uid root object uid root mode w", "subject uid root object uid root mode x", "subject uid root object uid root mode arswx", /* Variations on subject and object gids. */ "subject gid wheel object gid wheel mode n", "subject gid wheel object gid daemon mode n", "subject gid daemon object gid wheel mode n", "subject gid daemon object gid daemon mode n", /* Subject uids and subject gids. */ "subject uid bin gid daemon object uid operator gid wheel mode n", /* Not */ "subject not uid operator object uid bin mode n", "subject uid bin object not uid operator mode n", "subject not uid daemon object not uid operator mode n", /* Ranges */ "subject uid root:operator object gid wheel:bin mode n", /* Jail ID */ "subject jailid 1 object uid root mode n", /* Filesys */ "subject uid root object filesys / mode n", "subject uid root object filesys /dev mode n", /* S/UGID */ "subject not uid root object sgid mode n", "subject not uid root object sgid mode n", /* Matching uid/gid */ "subject not uid root:operator object not uid_of_subject mode n", "subject not gid wheel:bin object not gid_of_subject mode n", /* Object types */ "subject uid root object type a mode a", "subject uid root object type r mode a", "subject uid root object type d mode a", "subject uid root object type b mode a", "subject uid root object type c mode a", "subject uid root object type l mode a", "subject uid root object type s mode a", "subject uid root object type rbc mode a", "subject uid root object type dls mode a", /* Empty rules always match */ "subject object mode a", /* Partial negations */ "subject ! uid root object mode n", "subject ! gid wheel object mode n", "subject ! jailid 2 object mode n", "subject object ! uid root mode n", "subject object ! gid wheel mode n", "subject object ! filesys / mode n", "subject object ! suid mode n", "subject object ! sgid mode n", "subject object ! uid_of_subject mode n", "subject object ! gid_of_subject mode n", "subject object ! type d mode n", /* All out nonsense */ "subject uid root ! gid wheel:bin ! jailid 1 " "object ! uid root:daemon gid daemon filesys / suid sgid uid_of_subject gid_of_subject ! type r " "mode rsx", }; static void test_libugidfw_strings(void) { struct mac_bsdextended_rule rule; char errorstr[256]; char rulestr[256]; size_t i; int error; for (i = 0; i < nitems(test_users); i++, test_num++) { if (getpwnam(test_users[i]) == NULL) printf("not ok %d # test_libugidfw_strings: getpwnam(%s) " "failed: %s\n", test_num, test_users[i], strerror(errno)); else printf("ok %d\n", test_num); } for (i = 0; i < nitems(test_groups); i++, test_num++) { if (getgrnam(test_groups[i]) == NULL) printf("not ok %d # test_libugidfw_strings: getgrnam(%s) " "failed: %s\n", test_num, test_groups[i], strerror(errno)); else printf("ok %d\n", test_num); } for (i = 0; i < nitems(test_strings); i++) { error = bsde_parse_rule_string(test_strings[i], &rule, sizeof(errorstr), errorstr); if (error == -1) printf("not ok %d # bsde_parse_rule_string: '%s' (%zu) " "failed: %s\n", test_num, test_strings[i], i, errorstr); else printf("ok %d\n", test_num); test_num++; error = bsde_rule_to_string(&rule, rulestr, sizeof(rulestr)); if (error < 0) printf("not ok %d # bsde_rule_to_string: rule for '%s' " "returned %d\n", test_num, test_strings[i], error); else printf("ok %d\n", test_num); test_num++; if (strcmp(test_strings[i], rulestr) != 0) printf("not ok %d # test_libugidfw: '%s' in, '%s' " "out\n", test_num, test_strings[i], rulestr); else printf("ok %d\n", test_num); test_num++; } } int main(void) { char errorstr[256]; int count, slots; test_num = 1; /* Print an error if a non-root user attemps to run the tests. */ if (getuid() != 0) { printf("1..0 # SKIP you must be root\n"); return (0); } switch (mac_is_present("bsdextended")) { case -1: printf("1..0 # SKIP mac_is_present failed: %s\n", strerror(errno)); return (0); case 1: break; case 0: default: printf("1..0 # SKIP mac_bsdextended not loaded\n"); return (0); } - printf("1..%lu\n", nitems(test_users) + nitems(test_groups) + + printf("1..%zu\n", nitems(test_users) + nitems(test_groups) + 3 * nitems(test_strings) + 2); test_libugidfw_strings(); /* * Some simple up-front checks to see if we're able to query the * policy for basic state. We want the rule count to be 0 before * starting, but "slots" is a property of prior runs and so we ignore * the return value. */ count = bsde_get_rule_count(sizeof(errorstr), errorstr); if (count == -1) printf("not ok %d # bsde_get_rule_count: %s\n", test_num, errorstr); else printf("ok %d\n", test_num); test_num++; slots = bsde_get_rule_slots(sizeof(errorstr), errorstr); if (slots == -1) printf("not ok %d # bsde_get_rule_slots: %s\n", test_num, errorstr); else printf("ok %d\n", test_num); return (0); } Index: projects/netbsd-tests-upstream-01-2017/tests/sys/vfs/lookup_cap_dotdot.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/tests/sys/vfs/lookup_cap_dotdot.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/tests/sys/vfs/lookup_cap_dotdot.c (revision 312218) @@ -1,248 +1,256 @@ /*- * Copyright (c) 2016 Ed Maste * Copyright (c) 2016 Conrad Meyer * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "freebsd_test_suite/macros.h" static int dirfd = -1; static char *abspath; static void touchat(int _dirfd, const char *name) { int fd; ATF_REQUIRE((fd = openat(_dirfd, name, O_CREAT | O_TRUNC | O_WRONLY, 0777)) >= 0); ATF_REQUIRE(close(fd) == 0); } static void prepare_dotdot_tests(void) { char cwd[MAXPATHLEN]; ATF_REQUIRE(getcwd(cwd, sizeof(cwd)) != NULL); asprintf(&abspath, "%s/testdir/d1/f1", cwd); ATF_REQUIRE(mkdir("testdir", 0777) == 0); ATF_REQUIRE((dirfd = open("testdir", O_RDONLY)) >= 0); ATF_REQUIRE(mkdirat(dirfd, "d1", 0777) == 0); ATF_REQUIRE(mkdirat(dirfd, "d1/d2", 0777) == 0); ATF_REQUIRE(mkdirat(dirfd, "d1/d2/d3", 0777) == 0); touchat(dirfd, "d1/f1"); touchat(dirfd, "d1/d2/f2"); touchat(dirfd, "d1/d2/d3/f3"); ATF_REQUIRE(symlinkat("d1/d2/d3", dirfd, "l3") == 0); ATF_REQUIRE(symlinkat("../testdir/d1", dirfd, "lup") == 0); ATF_REQUIRE(symlinkat("../..", dirfd, "d1/d2/d3/ld1") == 0); ATF_REQUIRE(symlinkat("../../f1", dirfd, "d1/d2/d3/lf1") == 0); } static void check_capsicum(void) { ATF_REQUIRE_FEATURE("security_capabilities"); ATF_REQUIRE_FEATURE("security_capability_mode"); } /* * Positive tests */ ATF_TC(openat__basic_positive); ATF_TC_HEAD(openat__basic_positive, tc) { atf_tc_set_md_var(tc, "descr", "Basic positive openat testcases"); } ATF_TC_BODY(openat__basic_positive, tc) { prepare_dotdot_tests(); ATF_REQUIRE(openat(dirfd, "d1/d2/d3/f3", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "d1/d2/d3/../../f1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "l3/f3", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "l3/../../f1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "../testdir/d1/f1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "lup/f1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "l3/ld1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "l3/lf1", O_RDONLY) >= 0); ATF_REQUIRE(open(abspath, O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, abspath, O_RDONLY) >= 0); } ATF_TC(lookup_cap_dotdot__basic); ATF_TC_HEAD(lookup_cap_dotdot__basic, tc) { atf_tc_set_md_var(tc, "descr", "Validate cap-mode (testdir)/d1/.. lookup"); } ATF_TC_BODY(lookup_cap_dotdot__basic, tc) { cap_rights_t rights; check_capsicum(); prepare_dotdot_tests(); cap_rights_init(&rights, CAP_LOOKUP, CAP_READ); ATF_REQUIRE(cap_rights_limit(dirfd, &rights) >= 0); + atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690"); + ATF_REQUIRE(cap_enter() >= 0); ATF_REQUIRE_MSG(openat(dirfd, "d1/..", O_RDONLY) >= 0, "%s", strerror(errno)); } ATF_TC(lookup_cap_dotdot__advanced); ATF_TC_HEAD(lookup_cap_dotdot__advanced, tc) { atf_tc_set_md_var(tc, "descr", "Validate cap-mode (testdir)/d1/.. lookup"); } ATF_TC_BODY(lookup_cap_dotdot__advanced, tc) { cap_rights_t rights; check_capsicum(); prepare_dotdot_tests(); + atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690"); + cap_rights_init(&rights, CAP_LOOKUP, CAP_READ); ATF_REQUIRE(cap_rights_limit(dirfd, &rights) >= 0); ATF_REQUIRE(cap_enter() >= 0); ATF_REQUIRE(openat(dirfd, "d1/d2/d3/../../f1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "l3/../../f1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "l3/ld1", O_RDONLY) >= 0); ATF_REQUIRE(openat(dirfd, "l3/lf1", O_RDONLY) >= 0); } /* * Negative tests */ ATF_TC(openat__basic_negative); ATF_TC_HEAD(openat__basic_negative, tc) { atf_tc_set_md_var(tc, "descr", "Basic negative openat testcases"); } ATF_TC_BODY(openat__basic_negative, tc) { prepare_dotdot_tests(); ATF_REQUIRE_ERRNO(ENOENT, openat(dirfd, "does-not-exist", O_RDONLY) < 0); ATF_REQUIRE_ERRNO(ENOENT, openat(dirfd, "l3/does-not-exist", O_RDONLY) < 0); } ATF_TC(capmode__negative); ATF_TC_HEAD(capmode__negative, tc) { atf_tc_set_md_var(tc, "descr", "Negative Capability mode testcases"); } ATF_TC_BODY(capmode__negative, tc) { int subdirfd; check_capsicum(); prepare_dotdot_tests(); + atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690"); + ATF_REQUIRE(cap_enter() == 0); /* open() not permitted in capability mode */ ATF_REQUIRE_ERRNO(ECAPMODE, open("testdir", O_RDONLY) < 0); /* AT_FDCWD not permitted in capability mode */ ATF_REQUIRE_ERRNO(ECAPMODE, openat(AT_FDCWD, "d1/f1", O_RDONLY) < 0); /* Relative path above dirfd not capable */ ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "..", O_RDONLY) < 0); ATF_REQUIRE((subdirfd = openat(dirfd, "l3", O_RDONLY)) >= 0); ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(subdirfd, "../../f1", O_RDONLY) < 0); /* Absolute paths not capable */ ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, abspath, O_RDONLY) < 0); /* Symlink above dirfd */ ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "lup/f1", O_RDONLY) < 0); } ATF_TC(lookup_cap_dotdot__negative); ATF_TC_HEAD(lookup_cap_dotdot__negative, tc) { atf_tc_set_md_var(tc, "descr", "Validate cap-mode (testdir)/.. lookup fails"); } ATF_TC_BODY(lookup_cap_dotdot__negative, tc) { cap_rights_t rights; check_capsicum(); prepare_dotdot_tests(); cap_rights_init(&rights, CAP_LOOKUP, CAP_READ); ATF_REQUIRE(cap_rights_limit(dirfd, &rights) >= 0); + + atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690"); ATF_REQUIRE(cap_enter() >= 0); ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "..", O_RDONLY) < 0); ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "d1/../..", O_RDONLY) < 0); ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "../testdir/d1/f1", O_RDONLY) < 0); } ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, openat__basic_positive); ATF_TP_ADD_TC(tp, openat__basic_negative); ATF_TP_ADD_TC(tp, capmode__negative); ATF_TP_ADD_TC(tp, lookup_cap_dotdot__basic); ATF_TP_ADD_TC(tp, lookup_cap_dotdot__advanced); ATF_TP_ADD_TC(tp, lookup_cap_dotdot__negative); return (atf_no_error()); } Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.c (revision 312218) @@ -1,2729 +1,2730 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ctld.h" #include "isns.h" bool proxy_mode = false; static volatile bool sighup_received = false; static volatile bool sigterm_received = false; static volatile bool sigalrm_received = false; static int nchildren = 0; static uint16_t last_portal_group_tag = 0xff; static void usage(void) { fprintf(stderr, "usage: ctld [-d][-u][-f config-file]\n"); exit(1); } char * checked_strdup(const char *s) { char *c; c = strdup(s); if (c == NULL) log_err(1, "strdup"); return (c); } struct conf * conf_new(void) { struct conf *conf; conf = calloc(1, sizeof(*conf)); if (conf == NULL) log_err(1, "calloc"); TAILQ_INIT(&conf->conf_luns); TAILQ_INIT(&conf->conf_targets); TAILQ_INIT(&conf->conf_auth_groups); TAILQ_INIT(&conf->conf_ports); TAILQ_INIT(&conf->conf_portal_groups); TAILQ_INIT(&conf->conf_pports); TAILQ_INIT(&conf->conf_isns); conf->conf_isns_period = 900; conf->conf_isns_timeout = 5; conf->conf_debug = 0; conf->conf_timeout = 60; conf->conf_maxproc = 30; return (conf); } void conf_delete(struct conf *conf) { struct lun *lun, *ltmp; struct target *targ, *tmp; struct auth_group *ag, *cagtmp; struct portal_group *pg, *cpgtmp; struct pport *pp, *pptmp; struct isns *is, *istmp; assert(conf->conf_pidfh == NULL); TAILQ_FOREACH_SAFE(lun, &conf->conf_luns, l_next, ltmp) lun_delete(lun); TAILQ_FOREACH_SAFE(targ, &conf->conf_targets, t_next, tmp) target_delete(targ); TAILQ_FOREACH_SAFE(ag, &conf->conf_auth_groups, ag_next, cagtmp) auth_group_delete(ag); TAILQ_FOREACH_SAFE(pg, &conf->conf_portal_groups, pg_next, cpgtmp) portal_group_delete(pg); TAILQ_FOREACH_SAFE(pp, &conf->conf_pports, pp_next, pptmp) pport_delete(pp); TAILQ_FOREACH_SAFE(is, &conf->conf_isns, i_next, istmp) isns_delete(is); assert(TAILQ_EMPTY(&conf->conf_ports)); free(conf->conf_pidfile_path); free(conf); } static struct auth * auth_new(struct auth_group *ag) { struct auth *auth; auth = calloc(1, sizeof(*auth)); if (auth == NULL) log_err(1, "calloc"); auth->a_auth_group = ag; TAILQ_INSERT_TAIL(&ag->ag_auths, auth, a_next); return (auth); } static void auth_delete(struct auth *auth) { TAILQ_REMOVE(&auth->a_auth_group->ag_auths, auth, a_next); free(auth->a_user); free(auth->a_secret); free(auth->a_mutual_user); free(auth->a_mutual_secret); free(auth); } const struct auth * auth_find(const struct auth_group *ag, const char *user) { const struct auth *auth; TAILQ_FOREACH(auth, &ag->ag_auths, a_next) { if (strcmp(auth->a_user, user) == 0) return (auth); } return (NULL); } static void auth_check_secret_length(struct auth *auth) { size_t len; len = strlen(auth->a_secret); if (len > 16) { if (auth->a_auth_group->ag_name != NULL) log_warnx("secret for user \"%s\", auth-group \"%s\", " "is too long; it should be at most 16 characters " "long", auth->a_user, auth->a_auth_group->ag_name); else log_warnx("secret for user \"%s\", target \"%s\", " "is too long; it should be at most 16 characters " "long", auth->a_user, auth->a_auth_group->ag_target->t_name); } if (len < 12) { if (auth->a_auth_group->ag_name != NULL) log_warnx("secret for user \"%s\", auth-group \"%s\", " "is too short; it should be at least 12 characters " "long", auth->a_user, auth->a_auth_group->ag_name); else log_warnx("secret for user \"%s\", target \"%s\", " "is too short; it should be at least 12 characters " "long", auth->a_user, auth->a_auth_group->ag_target->t_name); } if (auth->a_mutual_secret != NULL) { len = strlen(auth->a_mutual_secret); if (len > 16) { if (auth->a_auth_group->ag_name != NULL) log_warnx("mutual secret for user \"%s\", " "auth-group \"%s\", is too long; it should " "be at most 16 characters long", auth->a_user, auth->a_auth_group->ag_name); else log_warnx("mutual secret for user \"%s\", " "target \"%s\", is too long; it should " "be at most 16 characters long", auth->a_user, auth->a_auth_group->ag_target->t_name); } if (len < 12) { if (auth->a_auth_group->ag_name != NULL) log_warnx("mutual secret for user \"%s\", " "auth-group \"%s\", is too short; it " "should be at least 12 characters long", auth->a_user, auth->a_auth_group->ag_name); else log_warnx("mutual secret for user \"%s\", " "target \"%s\", is too short; it should be " "at least 12 characters long", auth->a_user, auth->a_auth_group->ag_target->t_name); } } } const struct auth * auth_new_chap(struct auth_group *ag, const char *user, const char *secret) { struct auth *auth; if (ag->ag_type == AG_TYPE_UNKNOWN) ag->ag_type = AG_TYPE_CHAP; if (ag->ag_type != AG_TYPE_CHAP) { if (ag->ag_name != NULL) log_warnx("cannot mix \"chap\" authentication with " "other types for auth-group \"%s\"", ag->ag_name); else log_warnx("cannot mix \"chap\" authentication with " "other types for target \"%s\"", ag->ag_target->t_name); return (NULL); } auth = auth_new(ag); auth->a_user = checked_strdup(user); auth->a_secret = checked_strdup(secret); auth_check_secret_length(auth); return (auth); } const struct auth * auth_new_chap_mutual(struct auth_group *ag, const char *user, const char *secret, const char *user2, const char *secret2) { struct auth *auth; if (ag->ag_type == AG_TYPE_UNKNOWN) ag->ag_type = AG_TYPE_CHAP_MUTUAL; if (ag->ag_type != AG_TYPE_CHAP_MUTUAL) { if (ag->ag_name != NULL) log_warnx("cannot mix \"chap-mutual\" authentication " "with other types for auth-group \"%s\"", ag->ag_name); else log_warnx("cannot mix \"chap-mutual\" authentication " "with other types for target \"%s\"", ag->ag_target->t_name); return (NULL); } auth = auth_new(ag); auth->a_user = checked_strdup(user); auth->a_secret = checked_strdup(secret); auth->a_mutual_user = checked_strdup(user2); auth->a_mutual_secret = checked_strdup(secret2); auth_check_secret_length(auth); return (auth); } const struct auth_name * auth_name_new(struct auth_group *ag, const char *name) { struct auth_name *an; an = calloc(1, sizeof(*an)); if (an == NULL) log_err(1, "calloc"); an->an_auth_group = ag; an->an_initator_name = checked_strdup(name); TAILQ_INSERT_TAIL(&ag->ag_names, an, an_next); return (an); } static void auth_name_delete(struct auth_name *an) { TAILQ_REMOVE(&an->an_auth_group->ag_names, an, an_next); free(an->an_initator_name); free(an); } bool auth_name_defined(const struct auth_group *ag) { if (TAILQ_EMPTY(&ag->ag_names)) return (false); return (true); } const struct auth_name * auth_name_find(const struct auth_group *ag, const char *name) { const struct auth_name *auth_name; TAILQ_FOREACH(auth_name, &ag->ag_names, an_next) { if (strcmp(auth_name->an_initator_name, name) == 0) return (auth_name); } return (NULL); } int auth_name_check(const struct auth_group *ag, const char *initiator_name) { if (!auth_name_defined(ag)) return (0); if (auth_name_find(ag, initiator_name) == NULL) return (1); return (0); } const struct auth_portal * auth_portal_new(struct auth_group *ag, const char *portal) { struct auth_portal *ap; char *net, *mask, *str, *tmp; int len, dm, m; ap = calloc(1, sizeof(*ap)); if (ap == NULL) log_err(1, "calloc"); ap->ap_auth_group = ag; ap->ap_initator_portal = checked_strdup(portal); mask = str = checked_strdup(portal); net = strsep(&mask, "/"); if (net[0] == '[') net++; len = strlen(net); if (len == 0) goto error; if (net[len - 1] == ']') net[len - 1] = 0; if (strchr(net, ':') != NULL) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ap->ap_sa; sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; if (inet_pton(AF_INET6, net, &sin6->sin6_addr) <= 0) goto error; dm = 128; } else { struct sockaddr_in *sin = (struct sockaddr_in *)&ap->ap_sa; sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; if (inet_pton(AF_INET, net, &sin->sin_addr) <= 0) goto error; dm = 32; } if (mask != NULL) { m = strtol(mask, &tmp, 0); if (m < 0 || m > dm || tmp[0] != 0) goto error; } else m = dm; ap->ap_mask = m; free(str); TAILQ_INSERT_TAIL(&ag->ag_portals, ap, ap_next); return (ap); error: free(str); free(ap); log_warnx("incorrect initiator portal \"%s\"", portal); return (NULL); } static void auth_portal_delete(struct auth_portal *ap) { TAILQ_REMOVE(&ap->ap_auth_group->ag_portals, ap, ap_next); free(ap->ap_initator_portal); free(ap); } bool auth_portal_defined(const struct auth_group *ag) { if (TAILQ_EMPTY(&ag->ag_portals)) return (false); return (true); } const struct auth_portal * auth_portal_find(const struct auth_group *ag, const struct sockaddr_storage *ss) { const struct auth_portal *ap; const uint8_t *a, *b; int i; uint8_t bmask; TAILQ_FOREACH(ap, &ag->ag_portals, ap_next) { if (ap->ap_sa.ss_family != ss->ss_family) continue; if (ss->ss_family == AF_INET) { a = (const uint8_t *) &((const struct sockaddr_in *)ss)->sin_addr; b = (const uint8_t *) &((const struct sockaddr_in *)&ap->ap_sa)->sin_addr; } else { a = (const uint8_t *) &((const struct sockaddr_in6 *)ss)->sin6_addr; b = (const uint8_t *) &((const struct sockaddr_in6 *)&ap->ap_sa)->sin6_addr; } for (i = 0; i < ap->ap_mask / 8; i++) { if (a[i] != b[i]) goto next; } if (ap->ap_mask % 8) { bmask = 0xff << (8 - (ap->ap_mask % 8)); if ((a[i] & bmask) != (b[i] & bmask)) goto next; } return (ap); next: ; } return (NULL); } int auth_portal_check(const struct auth_group *ag, const struct sockaddr_storage *sa) { if (!auth_portal_defined(ag)) return (0); if (auth_portal_find(ag, sa) == NULL) return (1); return (0); } struct auth_group * auth_group_new(struct conf *conf, const char *name) { struct auth_group *ag; if (name != NULL) { ag = auth_group_find(conf, name); if (ag != NULL) { log_warnx("duplicated auth-group \"%s\"", name); return (NULL); } } ag = calloc(1, sizeof(*ag)); if (ag == NULL) log_err(1, "calloc"); if (name != NULL) ag->ag_name = checked_strdup(name); TAILQ_INIT(&ag->ag_auths); TAILQ_INIT(&ag->ag_names); TAILQ_INIT(&ag->ag_portals); ag->ag_conf = conf; TAILQ_INSERT_TAIL(&conf->conf_auth_groups, ag, ag_next); return (ag); } void auth_group_delete(struct auth_group *ag) { struct auth *auth, *auth_tmp; struct auth_name *auth_name, *auth_name_tmp; struct auth_portal *auth_portal, *auth_portal_tmp; TAILQ_REMOVE(&ag->ag_conf->conf_auth_groups, ag, ag_next); TAILQ_FOREACH_SAFE(auth, &ag->ag_auths, a_next, auth_tmp) auth_delete(auth); TAILQ_FOREACH_SAFE(auth_name, &ag->ag_names, an_next, auth_name_tmp) auth_name_delete(auth_name); TAILQ_FOREACH_SAFE(auth_portal, &ag->ag_portals, ap_next, auth_portal_tmp) auth_portal_delete(auth_portal); free(ag->ag_name); free(ag); } struct auth_group * auth_group_find(const struct conf *conf, const char *name) { struct auth_group *ag; TAILQ_FOREACH(ag, &conf->conf_auth_groups, ag_next) { if (ag->ag_name != NULL && strcmp(ag->ag_name, name) == 0) return (ag); } return (NULL); } int auth_group_set_type(struct auth_group *ag, const char *str) { int type; if (strcmp(str, "none") == 0) { type = AG_TYPE_NO_AUTHENTICATION; } else if (strcmp(str, "deny") == 0) { type = AG_TYPE_DENY; } else if (strcmp(str, "chap") == 0) { type = AG_TYPE_CHAP; } else if (strcmp(str, "chap-mutual") == 0) { type = AG_TYPE_CHAP_MUTUAL; } else { if (ag->ag_name != NULL) log_warnx("invalid auth-type \"%s\" for auth-group " "\"%s\"", str, ag->ag_name); else log_warnx("invalid auth-type \"%s\" for target " "\"%s\"", str, ag->ag_target->t_name); return (1); } if (ag->ag_type != AG_TYPE_UNKNOWN && ag->ag_type != type) { if (ag->ag_name != NULL) { log_warnx("cannot set auth-type to \"%s\" for " "auth-group \"%s\"; already has a different " "type", str, ag->ag_name); } else { log_warnx("cannot set auth-type to \"%s\" for target " "\"%s\"; already has a different type", str, ag->ag_target->t_name); } return (1); } ag->ag_type = type; return (0); } static struct portal * portal_new(struct portal_group *pg) { struct portal *portal; portal = calloc(1, sizeof(*portal)); if (portal == NULL) log_err(1, "calloc"); TAILQ_INIT(&portal->p_targets); portal->p_portal_group = pg; TAILQ_INSERT_TAIL(&pg->pg_portals, portal, p_next); return (portal); } static void portal_delete(struct portal *portal) { TAILQ_REMOVE(&portal->p_portal_group->pg_portals, portal, p_next); if (portal->p_ai != NULL) freeaddrinfo(portal->p_ai); free(portal->p_listen); free(portal); } struct portal_group * portal_group_new(struct conf *conf, const char *name) { struct portal_group *pg; pg = portal_group_find(conf, name); if (pg != NULL) { log_warnx("duplicated portal-group \"%s\"", name); return (NULL); } pg = calloc(1, sizeof(*pg)); if (pg == NULL) log_err(1, "calloc"); pg->pg_name = checked_strdup(name); TAILQ_INIT(&pg->pg_options); TAILQ_INIT(&pg->pg_portals); TAILQ_INIT(&pg->pg_ports); pg->pg_conf = conf; pg->pg_tag = 0; /* Assigned later in conf_apply(). */ TAILQ_INSERT_TAIL(&conf->conf_portal_groups, pg, pg_next); return (pg); } void portal_group_delete(struct portal_group *pg) { struct portal *portal, *tmp; struct port *port, *tport; struct option *o, *otmp; TAILQ_FOREACH_SAFE(port, &pg->pg_ports, p_pgs, tport) port_delete(port); TAILQ_REMOVE(&pg->pg_conf->conf_portal_groups, pg, pg_next); TAILQ_FOREACH_SAFE(portal, &pg->pg_portals, p_next, tmp) portal_delete(portal); TAILQ_FOREACH_SAFE(o, &pg->pg_options, o_next, otmp) option_delete(&pg->pg_options, o); free(pg->pg_name); free(pg->pg_offload); free(pg->pg_redirection); free(pg); } struct portal_group * portal_group_find(const struct conf *conf, const char *name) { struct portal_group *pg; TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { if (strcmp(pg->pg_name, name) == 0) return (pg); } return (NULL); } static int parse_addr_port(char *arg, const char *def_port, struct addrinfo **ai) { struct addrinfo hints; char *str, *addr, *ch; const char *port; int error, colons = 0; str = arg = strdup(arg); if (arg[0] == '[') { /* * IPv6 address in square brackets, perhaps with port. */ arg++; addr = strsep(&arg, "]"); if (arg == NULL) { free(str); return (1); } if (arg[0] == '\0') { port = def_port; } else if (arg[0] == ':') { port = arg + 1; } else { free(str); return (1); } } else { /* * Either IPv6 address without brackets - and without * a port - or IPv4 address. Just count the colons. */ for (ch = arg; *ch != '\0'; ch++) { if (*ch == ':') colons++; } if (colons > 1) { addr = arg; port = def_port; } else { addr = strsep(&arg, ":"); if (arg == NULL) port = def_port; else port = arg; } } memset(&hints, 0, sizeof(hints)); hints.ai_family = PF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_PASSIVE; error = getaddrinfo(addr, port, &hints, ai); free(str); return ((error != 0) ? 1 : 0); } int portal_group_add_listen(struct portal_group *pg, const char *value, bool iser) { struct portal *portal; portal = portal_new(pg); portal->p_listen = checked_strdup(value); portal->p_iser = iser; if (parse_addr_port(portal->p_listen, "3260", &portal->p_ai)) { log_warnx("invalid listen address %s", portal->p_listen); portal_delete(portal); return (1); } /* * XXX: getaddrinfo(3) may return multiple addresses; we should turn * those into multiple portals. */ return (0); } int isns_new(struct conf *conf, const char *addr) { struct isns *isns; isns = calloc(1, sizeof(*isns)); if (isns == NULL) log_err(1, "calloc"); isns->i_conf = conf; TAILQ_INSERT_TAIL(&conf->conf_isns, isns, i_next); isns->i_addr = checked_strdup(addr); if (parse_addr_port(isns->i_addr, "3205", &isns->i_ai)) { log_warnx("invalid iSNS address %s", isns->i_addr); isns_delete(isns); return (1); } /* * XXX: getaddrinfo(3) may return multiple addresses; we should turn * those into multiple servers. */ return (0); } void isns_delete(struct isns *isns) { TAILQ_REMOVE(&isns->i_conf->conf_isns, isns, i_next); free(isns->i_addr); if (isns->i_ai != NULL) freeaddrinfo(isns->i_ai); free(isns); } static int isns_do_connect(struct isns *isns) { int s; s = socket(isns->i_ai->ai_family, isns->i_ai->ai_socktype, isns->i_ai->ai_protocol); if (s < 0) { log_warn("socket(2) failed for %s", isns->i_addr); return (-1); } if (connect(s, isns->i_ai->ai_addr, isns->i_ai->ai_addrlen)) { log_warn("connect(2) failed for %s", isns->i_addr); close(s); return (-1); } return(s); } static int isns_do_register(struct isns *isns, int s, const char *hostname) { struct conf *conf = isns->i_conf; struct target *target; struct portal *portal; struct portal_group *pg; struct port *port; struct isns_req *req; int res = 0; uint32_t error; req = isns_req_create(ISNS_FUNC_DEVATTRREG, ISNS_FLAG_CLIENT); isns_req_add_str(req, 32, TAILQ_FIRST(&conf->conf_targets)->t_name); isns_req_add_delim(req); isns_req_add_str(req, 1, hostname); isns_req_add_32(req, 2, 2); /* 2 -- iSCSI */ isns_req_add_32(req, 6, conf->conf_isns_period); TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { if (pg->pg_unassigned) continue; TAILQ_FOREACH(portal, &pg->pg_portals, p_next) { isns_req_add_addr(req, 16, portal->p_ai); isns_req_add_port(req, 17, portal->p_ai); } } TAILQ_FOREACH(target, &conf->conf_targets, t_next) { isns_req_add_str(req, 32, target->t_name); isns_req_add_32(req, 33, 1); /* 1 -- Target*/ if (target->t_alias != NULL) isns_req_add_str(req, 34, target->t_alias); TAILQ_FOREACH(port, &target->t_ports, p_ts) { if ((pg = port->p_portal_group) == NULL) continue; isns_req_add_32(req, 51, pg->pg_tag); TAILQ_FOREACH(portal, &pg->pg_portals, p_next) { isns_req_add_addr(req, 49, portal->p_ai); isns_req_add_port(req, 50, portal->p_ai); } } } res = isns_req_send(s, req); if (res < 0) { log_warn("send(2) failed for %s", isns->i_addr); goto quit; } res = isns_req_receive(s, req); if (res < 0) { log_warn("receive(2) failed for %s", isns->i_addr); goto quit; } error = isns_req_get_status(req); if (error != 0) { log_warnx("iSNS register error %d for %s", error, isns->i_addr); res = -1; } quit: isns_req_free(req); return (res); } static int isns_do_check(struct isns *isns, int s, const char *hostname) { struct conf *conf = isns->i_conf; struct isns_req *req; int res = 0; uint32_t error; req = isns_req_create(ISNS_FUNC_DEVATTRQRY, ISNS_FLAG_CLIENT); isns_req_add_str(req, 32, TAILQ_FIRST(&conf->conf_targets)->t_name); isns_req_add_str(req, 1, hostname); isns_req_add_delim(req); isns_req_add(req, 2, 0, NULL); res = isns_req_send(s, req); if (res < 0) { log_warn("send(2) failed for %s", isns->i_addr); goto quit; } res = isns_req_receive(s, req); if (res < 0) { log_warn("receive(2) failed for %s", isns->i_addr); goto quit; } error = isns_req_get_status(req); if (error != 0) { log_warnx("iSNS check error %d for %s", error, isns->i_addr); res = -1; } quit: isns_req_free(req); return (res); } static int isns_do_deregister(struct isns *isns, int s, const char *hostname) { struct conf *conf = isns->i_conf; struct isns_req *req; int res = 0; uint32_t error; req = isns_req_create(ISNS_FUNC_DEVDEREG, ISNS_FLAG_CLIENT); isns_req_add_str(req, 32, TAILQ_FIRST(&conf->conf_targets)->t_name); isns_req_add_delim(req); isns_req_add_str(req, 1, hostname); res = isns_req_send(s, req); if (res < 0) { log_warn("send(2) failed for %s", isns->i_addr); goto quit; } res = isns_req_receive(s, req); if (res < 0) { log_warn("receive(2) failed for %s", isns->i_addr); goto quit; } error = isns_req_get_status(req); if (error != 0) { log_warnx("iSNS deregister error %d for %s", error, isns->i_addr); res = -1; } quit: isns_req_free(req); return (res); } void isns_register(struct isns *isns, struct isns *oldisns) { struct conf *conf = isns->i_conf; int s; char hostname[256]; if (TAILQ_EMPTY(&conf->conf_targets) || TAILQ_EMPTY(&conf->conf_portal_groups)) return; set_timeout(conf->conf_isns_timeout, false); s = isns_do_connect(isns); if (s < 0) { set_timeout(0, false); return; } gethostname(hostname, sizeof(hostname)); if (oldisns == NULL || TAILQ_EMPTY(&oldisns->i_conf->conf_targets)) oldisns = isns; isns_do_deregister(oldisns, s, hostname); isns_do_register(isns, s, hostname); close(s); set_timeout(0, false); } void isns_check(struct isns *isns) { struct conf *conf = isns->i_conf; int s, res; char hostname[256]; if (TAILQ_EMPTY(&conf->conf_targets) || TAILQ_EMPTY(&conf->conf_portal_groups)) return; set_timeout(conf->conf_isns_timeout, false); s = isns_do_connect(isns); if (s < 0) { set_timeout(0, false); return; } gethostname(hostname, sizeof(hostname)); res = isns_do_check(isns, s, hostname); if (res < 0) { isns_do_deregister(isns, s, hostname); isns_do_register(isns, s, hostname); } close(s); set_timeout(0, false); } void isns_deregister(struct isns *isns) { struct conf *conf = isns->i_conf; int s; char hostname[256]; if (TAILQ_EMPTY(&conf->conf_targets) || TAILQ_EMPTY(&conf->conf_portal_groups)) return; set_timeout(conf->conf_isns_timeout, false); s = isns_do_connect(isns); if (s < 0) return; gethostname(hostname, sizeof(hostname)); isns_do_deregister(isns, s, hostname); close(s); set_timeout(0, false); } int portal_group_set_filter(struct portal_group *pg, const char *str) { int filter; if (strcmp(str, "none") == 0) { filter = PG_FILTER_NONE; } else if (strcmp(str, "portal") == 0) { filter = PG_FILTER_PORTAL; } else if (strcmp(str, "portal-name") == 0) { filter = PG_FILTER_PORTAL_NAME; } else if (strcmp(str, "portal-name-auth") == 0) { filter = PG_FILTER_PORTAL_NAME_AUTH; } else { log_warnx("invalid discovery-filter \"%s\" for portal-group " "\"%s\"; valid values are \"none\", \"portal\", " "\"portal-name\", and \"portal-name-auth\"", str, pg->pg_name); return (1); } if (pg->pg_discovery_filter != PG_FILTER_UNKNOWN && pg->pg_discovery_filter != filter) { log_warnx("cannot set discovery-filter to \"%s\" for " "portal-group \"%s\"; already has a different " "value", str, pg->pg_name); return (1); } pg->pg_discovery_filter = filter; return (0); } int portal_group_set_offload(struct portal_group *pg, const char *offload) { if (pg->pg_offload != NULL) { log_warnx("cannot set offload to \"%s\" for " "portal-group \"%s\"; already defined", offload, pg->pg_name); return (1); } pg->pg_offload = checked_strdup(offload); return (0); } int portal_group_set_redirection(struct portal_group *pg, const char *addr) { if (pg->pg_redirection != NULL) { log_warnx("cannot set redirection to \"%s\" for " "portal-group \"%s\"; already defined", addr, pg->pg_name); return (1); } pg->pg_redirection = checked_strdup(addr); return (0); } static bool valid_hex(const char ch) { switch (ch) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'a': case 'A': case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'e': case 'E': case 'f': case 'F': return (true); default: return (false); } } bool valid_iscsi_name(const char *name) { int i; if (strlen(name) >= MAX_NAME_LEN) { log_warnx("overlong name for target \"%s\"; max length allowed " "by iSCSI specification is %d characters", name, MAX_NAME_LEN); return (false); } /* * In the cases below, we don't return an error, just in case the admin * was right, and we're wrong. */ if (strncasecmp(name, "iqn.", strlen("iqn.")) == 0) { for (i = strlen("iqn."); name[i] != '\0'; i++) { /* * XXX: We should verify UTF-8 normalisation, as defined * by 3.2.6.2: iSCSI Name Encoding. */ if (isalnum(name[i])) continue; if (name[i] == '-' || name[i] == '.' || name[i] == ':') continue; log_warnx("invalid character \"%c\" in target name " "\"%s\"; allowed characters are letters, digits, " "'-', '.', and ':'", name[i], name); break; } /* * XXX: Check more stuff: valid date and a valid reversed domain. */ } else if (strncasecmp(name, "eui.", strlen("eui.")) == 0) { if (strlen(name) != strlen("eui.") + 16) log_warnx("invalid target name \"%s\"; the \"eui.\" " "should be followed by exactly 16 hexadecimal " "digits", name); for (i = strlen("eui."); name[i] != '\0'; i++) { if (!valid_hex(name[i])) { log_warnx("invalid character \"%c\" in target " "name \"%s\"; allowed characters are 1-9 " "and A-F", name[i], name); break; } } } else if (strncasecmp(name, "naa.", strlen("naa.")) == 0) { if (strlen(name) > strlen("naa.") + 32) log_warnx("invalid target name \"%s\"; the \"naa.\" " "should be followed by at most 32 hexadecimal " "digits", name); for (i = strlen("naa."); name[i] != '\0'; i++) { if (!valid_hex(name[i])) { log_warnx("invalid character \"%c\" in target " "name \"%s\"; allowed characters are 1-9 " "and A-F", name[i], name); break; } } } else { log_warnx("invalid target name \"%s\"; should start with " "either \"iqn.\", \"eui.\", or \"naa.\"", name); } return (true); } struct pport * pport_new(struct conf *conf, const char *name, uint32_t ctl_port) { struct pport *pp; pp = calloc(1, sizeof(*pp)); if (pp == NULL) log_err(1, "calloc"); pp->pp_conf = conf; pp->pp_name = checked_strdup(name); pp->pp_ctl_port = ctl_port; TAILQ_INIT(&pp->pp_ports); TAILQ_INSERT_TAIL(&conf->conf_pports, pp, pp_next); return (pp); } struct pport * pport_find(const struct conf *conf, const char *name) { struct pport *pp; TAILQ_FOREACH(pp, &conf->conf_pports, pp_next) { if (strcasecmp(pp->pp_name, name) == 0) return (pp); } return (NULL); } struct pport * pport_copy(struct pport *pp, struct conf *conf) { struct pport *ppnew; ppnew = pport_new(conf, pp->pp_name, pp->pp_ctl_port); return (ppnew); } void pport_delete(struct pport *pp) { struct port *port, *tport; TAILQ_FOREACH_SAFE(port, &pp->pp_ports, p_ts, tport) port_delete(port); TAILQ_REMOVE(&pp->pp_conf->conf_pports, pp, pp_next); free(pp->pp_name); free(pp); } struct port * port_new(struct conf *conf, struct target *target, struct portal_group *pg) { struct port *port; char *name; int ret; ret = asprintf(&name, "%s-%s", pg->pg_name, target->t_name); if (ret <= 0) log_err(1, "asprintf"); if (port_find(conf, name) != NULL) { log_warnx("duplicate port \"%s\"", name); free(name); return (NULL); } port = calloc(1, sizeof(*port)); if (port == NULL) log_err(1, "calloc"); port->p_conf = conf; port->p_name = name; TAILQ_INSERT_TAIL(&conf->conf_ports, port, p_next); TAILQ_INSERT_TAIL(&target->t_ports, port, p_ts); port->p_target = target; TAILQ_INSERT_TAIL(&pg->pg_ports, port, p_pgs); port->p_portal_group = pg; port->p_foreign = pg->pg_foreign; return (port); } struct port * port_new_pp(struct conf *conf, struct target *target, struct pport *pp) { struct port *port; char *name; int ret; ret = asprintf(&name, "%s-%s", pp->pp_name, target->t_name); if (ret <= 0) log_err(1, "asprintf"); if (port_find(conf, name) != NULL) { log_warnx("duplicate port \"%s\"", name); free(name); return (NULL); } port = calloc(1, sizeof(*port)); if (port == NULL) log_err(1, "calloc"); port->p_conf = conf; port->p_name = name; TAILQ_INSERT_TAIL(&conf->conf_ports, port, p_next); TAILQ_INSERT_TAIL(&target->t_ports, port, p_ts); port->p_target = target; TAILQ_INSERT_TAIL(&pp->pp_ports, port, p_pps); port->p_pport = pp; return (port); } struct port * port_find(const struct conf *conf, const char *name) { struct port *port; TAILQ_FOREACH(port, &conf->conf_ports, p_next) { if (strcasecmp(port->p_name, name) == 0) return (port); } return (NULL); } struct port * port_find_in_pg(const struct portal_group *pg, const char *target) { struct port *port; TAILQ_FOREACH(port, &pg->pg_ports, p_pgs) { if (strcasecmp(port->p_target->t_name, target) == 0) return (port); } return (NULL); } void port_delete(struct port *port) { if (port->p_portal_group) TAILQ_REMOVE(&port->p_portal_group->pg_ports, port, p_pgs); if (port->p_pport) TAILQ_REMOVE(&port->p_pport->pp_ports, port, p_pps); if (port->p_target) TAILQ_REMOVE(&port->p_target->t_ports, port, p_ts); TAILQ_REMOVE(&port->p_conf->conf_ports, port, p_next); free(port->p_name); free(port); } struct target * target_new(struct conf *conf, const char *name) { struct target *targ; int i, len; targ = target_find(conf, name); if (targ != NULL) { log_warnx("duplicated target \"%s\"", name); return (NULL); } if (valid_iscsi_name(name) == false) { log_warnx("target name \"%s\" is invalid", name); return (NULL); } targ = calloc(1, sizeof(*targ)); if (targ == NULL) log_err(1, "calloc"); targ->t_name = checked_strdup(name); /* * RFC 3722 requires us to normalize the name to lowercase. */ len = strlen(name); for (i = 0; i < len; i++) targ->t_name[i] = tolower(targ->t_name[i]); targ->t_conf = conf; TAILQ_INIT(&targ->t_ports); TAILQ_INSERT_TAIL(&conf->conf_targets, targ, t_next); return (targ); } void target_delete(struct target *targ) { struct port *port, *tport; TAILQ_FOREACH_SAFE(port, &targ->t_ports, p_ts, tport) port_delete(port); TAILQ_REMOVE(&targ->t_conf->conf_targets, targ, t_next); free(targ->t_name); free(targ->t_redirection); free(targ); } struct target * target_find(struct conf *conf, const char *name) { struct target *targ; TAILQ_FOREACH(targ, &conf->conf_targets, t_next) { if (strcasecmp(targ->t_name, name) == 0) return (targ); } return (NULL); } int target_set_redirection(struct target *target, const char *addr) { if (target->t_redirection != NULL) { log_warnx("cannot set redirection to \"%s\" for " "target \"%s\"; already defined", addr, target->t_name); return (1); } target->t_redirection = checked_strdup(addr); return (0); } struct lun * lun_new(struct conf *conf, const char *name) { struct lun *lun; lun = lun_find(conf, name); if (lun != NULL) { log_warnx("duplicated lun \"%s\"", name); return (NULL); } lun = calloc(1, sizeof(*lun)); if (lun == NULL) log_err(1, "calloc"); lun->l_conf = conf; lun->l_name = checked_strdup(name); TAILQ_INIT(&lun->l_options); TAILQ_INSERT_TAIL(&conf->conf_luns, lun, l_next); lun->l_ctl_lun = -1; return (lun); } void lun_delete(struct lun *lun) { struct target *targ; struct option *o, *tmp; int i; TAILQ_FOREACH(targ, &lun->l_conf->conf_targets, t_next) { for (i = 0; i < MAX_LUNS; i++) { if (targ->t_luns[i] == lun) targ->t_luns[i] = NULL; } } TAILQ_REMOVE(&lun->l_conf->conf_luns, lun, l_next); TAILQ_FOREACH_SAFE(o, &lun->l_options, o_next, tmp) option_delete(&lun->l_options, o); free(lun->l_name); free(lun->l_backend); free(lun->l_device_id); free(lun->l_path); free(lun->l_scsiname); free(lun->l_serial); free(lun); } struct lun * lun_find(const struct conf *conf, const char *name) { struct lun *lun; TAILQ_FOREACH(lun, &conf->conf_luns, l_next) { if (strcmp(lun->l_name, name) == 0) return (lun); } return (NULL); } void lun_set_backend(struct lun *lun, const char *value) { free(lun->l_backend); lun->l_backend = checked_strdup(value); } void lun_set_blocksize(struct lun *lun, size_t value) { lun->l_blocksize = value; } void lun_set_device_type(struct lun *lun, uint8_t value) { lun->l_device_type = value; } void lun_set_device_id(struct lun *lun, const char *value) { free(lun->l_device_id); lun->l_device_id = checked_strdup(value); } void lun_set_path(struct lun *lun, const char *value) { free(lun->l_path); lun->l_path = checked_strdup(value); } void lun_set_scsiname(struct lun *lun, const char *value) { free(lun->l_scsiname); lun->l_scsiname = checked_strdup(value); } void lun_set_serial(struct lun *lun, const char *value) { free(lun->l_serial); lun->l_serial = checked_strdup(value); } void lun_set_size(struct lun *lun, size_t value) { lun->l_size = value; } void lun_set_ctl_lun(struct lun *lun, uint32_t value) { lun->l_ctl_lun = value; } struct option * option_new(struct options *options, const char *name, const char *value) { struct option *o; o = option_find(options, name); if (o != NULL) { log_warnx("duplicated option \"%s\"", name); return (NULL); } o = calloc(1, sizeof(*o)); if (o == NULL) log_err(1, "calloc"); o->o_name = checked_strdup(name); o->o_value = checked_strdup(value); TAILQ_INSERT_TAIL(options, o, o_next); return (o); } void option_delete(struct options *options, struct option *o) { TAILQ_REMOVE(options, o, o_next); free(o->o_name); free(o->o_value); free(o); } struct option * option_find(const struct options *options, const char *name) { struct option *o; TAILQ_FOREACH(o, options, o_next) { if (strcmp(o->o_name, name) == 0) return (o); } return (NULL); } void option_set(struct option *o, const char *value) { free(o->o_value); o->o_value = checked_strdup(value); } static struct connection * connection_new(struct portal *portal, int fd, const char *host, const struct sockaddr *client_sa) { struct connection *conn; conn = calloc(1, sizeof(*conn)); if (conn == NULL) log_err(1, "calloc"); conn->conn_portal = portal; conn->conn_socket = fd; conn->conn_initiator_addr = checked_strdup(host); memcpy(&conn->conn_initiator_sa, client_sa, client_sa->sa_len); /* * Default values, from RFC 3720, section 12. */ conn->conn_max_recv_data_segment_length = 8192; + conn->conn_max_send_data_segment_length = 8192; conn->conn_max_burst_length = 262144; conn->conn_first_burst_length = 65536; conn->conn_immediate_data = true; return (conn); } #if 0 static void conf_print(struct conf *conf) { struct auth_group *ag; struct auth *auth; struct auth_name *auth_name; struct auth_portal *auth_portal; struct portal_group *pg; struct portal *portal; struct target *targ; struct lun *lun; struct option *o; TAILQ_FOREACH(ag, &conf->conf_auth_groups, ag_next) { fprintf(stderr, "auth-group %s {\n", ag->ag_name); TAILQ_FOREACH(auth, &ag->ag_auths, a_next) fprintf(stderr, "\t chap-mutual %s %s %s %s\n", auth->a_user, auth->a_secret, auth->a_mutual_user, auth->a_mutual_secret); TAILQ_FOREACH(auth_name, &ag->ag_names, an_next) fprintf(stderr, "\t initiator-name %s\n", auth_name->an_initator_name); TAILQ_FOREACH(auth_portal, &ag->ag_portals, an_next) fprintf(stderr, "\t initiator-portal %s\n", auth_portal->an_initator_portal); fprintf(stderr, "}\n"); } TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { fprintf(stderr, "portal-group %s {\n", pg->pg_name); TAILQ_FOREACH(portal, &pg->pg_portals, p_next) fprintf(stderr, "\t listen %s\n", portal->p_listen); fprintf(stderr, "}\n"); } TAILQ_FOREACH(lun, &conf->conf_luns, l_next) { fprintf(stderr, "\tlun %s {\n", lun->l_name); fprintf(stderr, "\t\tpath %s\n", lun->l_path); TAILQ_FOREACH(o, &lun->l_options, o_next) fprintf(stderr, "\t\toption %s %s\n", lo->o_name, lo->o_value); fprintf(stderr, "\t}\n"); } TAILQ_FOREACH(targ, &conf->conf_targets, t_next) { fprintf(stderr, "target %s {\n", targ->t_name); if (targ->t_alias != NULL) fprintf(stderr, "\t alias %s\n", targ->t_alias); fprintf(stderr, "}\n"); } } #endif static int conf_verify_lun(struct lun *lun) { const struct lun *lun2; if (lun->l_backend == NULL) lun_set_backend(lun, "block"); if (strcmp(lun->l_backend, "block") == 0) { if (lun->l_path == NULL) { log_warnx("missing path for lun \"%s\"", lun->l_name); return (1); } } else if (strcmp(lun->l_backend, "ramdisk") == 0) { if (lun->l_size == 0) { log_warnx("missing size for ramdisk-backed lun \"%s\"", lun->l_name); return (1); } if (lun->l_path != NULL) { log_warnx("path must not be specified " "for ramdisk-backed lun \"%s\"", lun->l_name); return (1); } } if (lun->l_blocksize == 0) { if (lun->l_device_type == 5) lun_set_blocksize(lun, DEFAULT_CD_BLOCKSIZE); else lun_set_blocksize(lun, DEFAULT_BLOCKSIZE); } else if (lun->l_blocksize < 0) { log_warnx("invalid blocksize for lun \"%s\"; " "must be larger than 0", lun->l_name); return (1); } if (lun->l_size != 0 && lun->l_size % lun->l_blocksize != 0) { log_warnx("invalid size for lun \"%s\"; " "must be multiple of blocksize", lun->l_name); return (1); } TAILQ_FOREACH(lun2, &lun->l_conf->conf_luns, l_next) { if (lun == lun2) continue; if (lun->l_path != NULL && lun2->l_path != NULL && strcmp(lun->l_path, lun2->l_path) == 0) { log_debugx("WARNING: path \"%s\" duplicated " "between lun \"%s\", and " "lun \"%s\"", lun->l_path, lun->l_name, lun2->l_name); } } return (0); } int conf_verify(struct conf *conf) { struct auth_group *ag; struct portal_group *pg; struct port *port; struct target *targ; struct lun *lun; bool found; int error, i; if (conf->conf_pidfile_path == NULL) conf->conf_pidfile_path = checked_strdup(DEFAULT_PIDFILE); TAILQ_FOREACH(lun, &conf->conf_luns, l_next) { error = conf_verify_lun(lun); if (error != 0) return (error); } TAILQ_FOREACH(targ, &conf->conf_targets, t_next) { if (targ->t_auth_group == NULL) { targ->t_auth_group = auth_group_find(conf, "default"); assert(targ->t_auth_group != NULL); } if (TAILQ_EMPTY(&targ->t_ports)) { pg = portal_group_find(conf, "default"); assert(pg != NULL); port_new(conf, targ, pg); } found = false; for (i = 0; i < MAX_LUNS; i++) { if (targ->t_luns[i] != NULL) found = true; } if (!found && targ->t_redirection == NULL) { log_warnx("no LUNs defined for target \"%s\"", targ->t_name); } if (found && targ->t_redirection != NULL) { log_debugx("target \"%s\" contains luns, " " but configured for redirection", targ->t_name); } } TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { assert(pg->pg_name != NULL); if (pg->pg_discovery_auth_group == NULL) { pg->pg_discovery_auth_group = auth_group_find(conf, "default"); assert(pg->pg_discovery_auth_group != NULL); } if (pg->pg_discovery_filter == PG_FILTER_UNKNOWN) pg->pg_discovery_filter = PG_FILTER_NONE; if (pg->pg_redirection != NULL) { if (!TAILQ_EMPTY(&pg->pg_ports)) { log_debugx("portal-group \"%s\" assigned " "to target, but configured " "for redirection", pg->pg_name); } pg->pg_unassigned = false; } else if (!TAILQ_EMPTY(&pg->pg_ports)) { pg->pg_unassigned = false; } else { if (strcmp(pg->pg_name, "default") != 0) log_warnx("portal-group \"%s\" not assigned " "to any target", pg->pg_name); pg->pg_unassigned = true; } } TAILQ_FOREACH(ag, &conf->conf_auth_groups, ag_next) { if (ag->ag_name == NULL) assert(ag->ag_target != NULL); else assert(ag->ag_target == NULL); found = false; TAILQ_FOREACH(targ, &conf->conf_targets, t_next) { if (targ->t_auth_group == ag) { found = true; break; } } TAILQ_FOREACH(port, &conf->conf_ports, p_next) { if (port->p_auth_group == ag) { found = true; break; } } TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { if (pg->pg_discovery_auth_group == ag) { found = true; break; } } if (!found && ag->ag_name != NULL && strcmp(ag->ag_name, "default") != 0 && strcmp(ag->ag_name, "no-authentication") != 0 && strcmp(ag->ag_name, "no-access") != 0) { log_warnx("auth-group \"%s\" not assigned " "to any target", ag->ag_name); } } return (0); } static int conf_apply(struct conf *oldconf, struct conf *newconf) { struct lun *oldlun, *newlun, *tmplun; struct portal_group *oldpg, *newpg; struct portal *oldp, *newp; struct port *oldport, *newport, *tmpport; struct isns *oldns, *newns; pid_t otherpid; int changed, cumulated_error = 0, error, sockbuf; int one = 1; if (oldconf->conf_debug != newconf->conf_debug) { log_debugx("changing debug level to %d", newconf->conf_debug); log_init(newconf->conf_debug); } if (oldconf->conf_pidfh != NULL) { assert(oldconf->conf_pidfile_path != NULL); if (newconf->conf_pidfile_path != NULL && strcmp(oldconf->conf_pidfile_path, newconf->conf_pidfile_path) == 0) { newconf->conf_pidfh = oldconf->conf_pidfh; oldconf->conf_pidfh = NULL; } else { log_debugx("removing pidfile %s", oldconf->conf_pidfile_path); pidfile_remove(oldconf->conf_pidfh); oldconf->conf_pidfh = NULL; } } if (newconf->conf_pidfh == NULL && newconf->conf_pidfile_path != NULL) { log_debugx("opening pidfile %s", newconf->conf_pidfile_path); newconf->conf_pidfh = pidfile_open(newconf->conf_pidfile_path, 0600, &otherpid); if (newconf->conf_pidfh == NULL) { if (errno == EEXIST) log_errx(1, "daemon already running, pid: %jd.", (intmax_t)otherpid); log_err(1, "cannot open or create pidfile \"%s\"", newconf->conf_pidfile_path); } } /* * Go through the new portal groups, assigning tags or preserving old. */ TAILQ_FOREACH(newpg, &newconf->conf_portal_groups, pg_next) { if (newpg->pg_tag != 0) continue; oldpg = portal_group_find(oldconf, newpg->pg_name); if (oldpg != NULL) newpg->pg_tag = oldpg->pg_tag; else newpg->pg_tag = ++last_portal_group_tag; } /* Deregister on removed iSNS servers. */ TAILQ_FOREACH(oldns, &oldconf->conf_isns, i_next) { TAILQ_FOREACH(newns, &newconf->conf_isns, i_next) { if (strcmp(oldns->i_addr, newns->i_addr) == 0) break; } if (newns == NULL) isns_deregister(oldns); } /* * XXX: If target or lun removal fails, we should somehow "move" * the old lun or target into newconf, so that subsequent * conf_apply() would try to remove them again. That would * be somewhat hairy, though, and lun deletion failures don't * really happen, so leave it as it is for now. */ /* * First, remove any ports present in the old configuration * and missing in the new one. */ TAILQ_FOREACH_SAFE(oldport, &oldconf->conf_ports, p_next, tmpport) { if (oldport->p_foreign) continue; newport = port_find(newconf, oldport->p_name); if (newport != NULL && !newport->p_foreign) continue; log_debugx("removing port \"%s\"", oldport->p_name); error = kernel_port_remove(oldport); if (error != 0) { log_warnx("failed to remove port %s", oldport->p_name); /* * XXX: Uncomment after fixing the root cause. * * cumulated_error++; */ } } /* * Second, remove any LUNs present in the old configuration * and missing in the new one. */ TAILQ_FOREACH_SAFE(oldlun, &oldconf->conf_luns, l_next, tmplun) { newlun = lun_find(newconf, oldlun->l_name); if (newlun == NULL) { log_debugx("lun \"%s\", CTL lun %d " "not found in new configuration; " "removing", oldlun->l_name, oldlun->l_ctl_lun); error = kernel_lun_remove(oldlun); if (error != 0) { log_warnx("failed to remove lun \"%s\", " "CTL lun %d", oldlun->l_name, oldlun->l_ctl_lun); cumulated_error++; } continue; } /* * Also remove the LUNs changed by more than size. */ changed = 0; assert(oldlun->l_backend != NULL); assert(newlun->l_backend != NULL); if (strcmp(newlun->l_backend, oldlun->l_backend) != 0) { log_debugx("backend for lun \"%s\", " "CTL lun %d changed; removing", oldlun->l_name, oldlun->l_ctl_lun); changed = 1; } if (oldlun->l_blocksize != newlun->l_blocksize) { log_debugx("blocksize for lun \"%s\", " "CTL lun %d changed; removing", oldlun->l_name, oldlun->l_ctl_lun); changed = 1; } if (newlun->l_device_id != NULL && (oldlun->l_device_id == NULL || strcmp(oldlun->l_device_id, newlun->l_device_id) != 0)) { log_debugx("device-id for lun \"%s\", " "CTL lun %d changed; removing", oldlun->l_name, oldlun->l_ctl_lun); changed = 1; } if (newlun->l_path != NULL && (oldlun->l_path == NULL || strcmp(oldlun->l_path, newlun->l_path) != 0)) { log_debugx("path for lun \"%s\", " "CTL lun %d, changed; removing", oldlun->l_name, oldlun->l_ctl_lun); changed = 1; } if (newlun->l_serial != NULL && (oldlun->l_serial == NULL || strcmp(oldlun->l_serial, newlun->l_serial) != 0)) { log_debugx("serial for lun \"%s\", " "CTL lun %d changed; removing", oldlun->l_name, oldlun->l_ctl_lun); changed = 1; } if (changed) { error = kernel_lun_remove(oldlun); if (error != 0) { log_warnx("failed to remove lun \"%s\", " "CTL lun %d", oldlun->l_name, oldlun->l_ctl_lun); cumulated_error++; } lun_delete(oldlun); continue; } lun_set_ctl_lun(newlun, oldlun->l_ctl_lun); } TAILQ_FOREACH_SAFE(newlun, &newconf->conf_luns, l_next, tmplun) { oldlun = lun_find(oldconf, newlun->l_name); if (oldlun != NULL) { log_debugx("modifying lun \"%s\", CTL lun %d", newlun->l_name, newlun->l_ctl_lun); error = kernel_lun_modify(newlun); if (error != 0) { log_warnx("failed to " "modify lun \"%s\", CTL lun %d", newlun->l_name, newlun->l_ctl_lun); cumulated_error++; } continue; } log_debugx("adding lun \"%s\"", newlun->l_name); error = kernel_lun_add(newlun); if (error != 0) { log_warnx("failed to add lun \"%s\"", newlun->l_name); lun_delete(newlun); cumulated_error++; } } /* * Now add new ports or modify existing ones. */ TAILQ_FOREACH(newport, &newconf->conf_ports, p_next) { if (newport->p_foreign) continue; oldport = port_find(oldconf, newport->p_name); if (oldport == NULL || oldport->p_foreign) { log_debugx("adding port \"%s\"", newport->p_name); error = kernel_port_add(newport); } else { log_debugx("updating port \"%s\"", newport->p_name); newport->p_ctl_port = oldport->p_ctl_port; error = kernel_port_update(newport, oldport); } if (error != 0) { log_warnx("failed to %s port %s", (oldport == NULL) ? "add" : "update", newport->p_name); /* * XXX: Uncomment after fixing the root cause. * * cumulated_error++; */ } } /* * Go through the new portals, opening the sockets as necessary. */ TAILQ_FOREACH(newpg, &newconf->conf_portal_groups, pg_next) { if (newpg->pg_foreign) continue; if (newpg->pg_unassigned) { log_debugx("not listening on portal-group \"%s\", " "not assigned to any target", newpg->pg_name); continue; } TAILQ_FOREACH(newp, &newpg->pg_portals, p_next) { /* * Try to find already open portal and reuse * the listening socket. We don't care about * what portal or portal group that was, what * matters is the listening address. */ TAILQ_FOREACH(oldpg, &oldconf->conf_portal_groups, pg_next) { TAILQ_FOREACH(oldp, &oldpg->pg_portals, p_next) { if (strcmp(newp->p_listen, oldp->p_listen) == 0 && oldp->p_socket > 0) { newp->p_socket = oldp->p_socket; oldp->p_socket = 0; break; } } } if (newp->p_socket > 0) { /* * We're done with this portal. */ continue; } #ifdef ICL_KERNEL_PROXY if (proxy_mode) { newpg->pg_conf->conf_portal_id++; newp->p_id = newpg->pg_conf->conf_portal_id; log_debugx("listening on %s, portal-group " "\"%s\", portal id %d, using ICL proxy", newp->p_listen, newpg->pg_name, newp->p_id); kernel_listen(newp->p_ai, newp->p_iser, newp->p_id); continue; } #endif assert(proxy_mode == false); assert(newp->p_iser == false); log_debugx("listening on %s, portal-group \"%s\"", newp->p_listen, newpg->pg_name); newp->p_socket = socket(newp->p_ai->ai_family, newp->p_ai->ai_socktype, newp->p_ai->ai_protocol); if (newp->p_socket < 0) { log_warn("socket(2) failed for %s", newp->p_listen); cumulated_error++; continue; } sockbuf = SOCKBUF_SIZE; if (setsockopt(newp->p_socket, SOL_SOCKET, SO_RCVBUF, &sockbuf, sizeof(sockbuf)) == -1) log_warn("setsockopt(SO_RCVBUF) failed " "for %s", newp->p_listen); sockbuf = SOCKBUF_SIZE; if (setsockopt(newp->p_socket, SOL_SOCKET, SO_SNDBUF, &sockbuf, sizeof(sockbuf)) == -1) log_warn("setsockopt(SO_SNDBUF) failed " "for %s", newp->p_listen); error = setsockopt(newp->p_socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); if (error != 0) { log_warn("setsockopt(SO_REUSEADDR) failed " "for %s", newp->p_listen); close(newp->p_socket); newp->p_socket = 0; cumulated_error++; continue; } error = bind(newp->p_socket, newp->p_ai->ai_addr, newp->p_ai->ai_addrlen); if (error != 0) { log_warn("bind(2) failed for %s", newp->p_listen); close(newp->p_socket); newp->p_socket = 0; cumulated_error++; continue; } error = listen(newp->p_socket, -1); if (error != 0) { log_warn("listen(2) failed for %s", newp->p_listen); close(newp->p_socket); newp->p_socket = 0; cumulated_error++; continue; } } } /* * Go through the no longer used sockets, closing them. */ TAILQ_FOREACH(oldpg, &oldconf->conf_portal_groups, pg_next) { TAILQ_FOREACH(oldp, &oldpg->pg_portals, p_next) { if (oldp->p_socket <= 0) continue; log_debugx("closing socket for %s, portal-group \"%s\"", oldp->p_listen, oldpg->pg_name); close(oldp->p_socket); oldp->p_socket = 0; } } /* (Re-)Register on remaining/new iSNS servers. */ TAILQ_FOREACH(newns, &newconf->conf_isns, i_next) { TAILQ_FOREACH(oldns, &oldconf->conf_isns, i_next) { if (strcmp(oldns->i_addr, newns->i_addr) == 0) break; } isns_register(newns, oldns); } /* Schedule iSNS update */ if (!TAILQ_EMPTY(&newconf->conf_isns)) set_timeout((newconf->conf_isns_period + 2) / 3, false); return (cumulated_error); } bool timed_out(void) { return (sigalrm_received); } static void sigalrm_handler_fatal(int dummy __unused) { /* * It would be easiest to just log an error and exit. We can't * do this, though, because log_errx() is not signal safe, since * it calls syslog(3). Instead, set a flag checked by pdu_send() * and pdu_receive(), to call log_errx() there. Should they fail * to notice, we'll exit here one second later. */ if (sigalrm_received) { /* * Oh well. Just give up and quit. */ _exit(2); } sigalrm_received = true; } static void sigalrm_handler(int dummy __unused) { sigalrm_received = true; } void set_timeout(int timeout, int fatal) { struct sigaction sa; struct itimerval itv; int error; if (timeout <= 0) { log_debugx("session timeout disabled"); bzero(&itv, sizeof(itv)); error = setitimer(ITIMER_REAL, &itv, NULL); if (error != 0) log_err(1, "setitimer"); sigalrm_received = false; return; } sigalrm_received = false; bzero(&sa, sizeof(sa)); if (fatal) sa.sa_handler = sigalrm_handler_fatal; else sa.sa_handler = sigalrm_handler; sigfillset(&sa.sa_mask); error = sigaction(SIGALRM, &sa, NULL); if (error != 0) log_err(1, "sigaction"); /* * First SIGALRM will arive after conf_timeout seconds. * If we do nothing, another one will arrive a second later. */ log_debugx("setting session timeout to %d seconds", timeout); bzero(&itv, sizeof(itv)); itv.it_interval.tv_sec = 1; itv.it_value.tv_sec = timeout; error = setitimer(ITIMER_REAL, &itv, NULL); if (error != 0) log_err(1, "setitimer"); } static int wait_for_children(bool block) { pid_t pid; int status; int num = 0; for (;;) { /* * If "block" is true, wait for at least one process. */ if (block && num == 0) pid = wait4(-1, &status, 0, NULL); else pid = wait4(-1, &status, WNOHANG, NULL); if (pid <= 0) break; if (WIFSIGNALED(status)) { log_warnx("child process %d terminated with signal %d", pid, WTERMSIG(status)); } else if (WEXITSTATUS(status) != 0) { log_warnx("child process %d terminated with exit status %d", pid, WEXITSTATUS(status)); } else { log_debugx("child process %d terminated gracefully", pid); } num++; } return (num); } static void handle_connection(struct portal *portal, int fd, const struct sockaddr *client_sa, bool dont_fork) { struct connection *conn; int error; pid_t pid; char host[NI_MAXHOST + 1]; struct conf *conf; conf = portal->p_portal_group->pg_conf; if (dont_fork) { log_debugx("incoming connection; not forking due to -d flag"); } else { nchildren -= wait_for_children(false); assert(nchildren >= 0); while (conf->conf_maxproc > 0 && nchildren >= conf->conf_maxproc) { log_debugx("maxproc limit of %d child processes hit; " "waiting for child process to exit", conf->conf_maxproc); nchildren -= wait_for_children(true); assert(nchildren >= 0); } log_debugx("incoming connection; forking child process #%d", nchildren); nchildren++; pid = fork(); if (pid < 0) log_err(1, "fork"); if (pid > 0) { close(fd); return; } } pidfile_close(conf->conf_pidfh); error = getnameinfo(client_sa, client_sa->sa_len, host, sizeof(host), NULL, 0, NI_NUMERICHOST); if (error != 0) log_errx(1, "getnameinfo: %s", gai_strerror(error)); log_debugx("accepted connection from %s; portal group \"%s\"", host, portal->p_portal_group->pg_name); log_set_peer_addr(host); setproctitle("%s", host); conn = connection_new(portal, fd, host, client_sa); set_timeout(conf->conf_timeout, true); kernel_capsicate(); login(conn); if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) { kernel_handoff(conn); log_debugx("connection handed off to the kernel"); } else { assert(conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY); discovery(conn); } log_debugx("nothing more to do; exiting"); exit(0); } static int fd_add(int fd, fd_set *fdset, int nfds) { /* * Skip sockets which we failed to bind. */ if (fd <= 0) return (nfds); FD_SET(fd, fdset); if (fd > nfds) nfds = fd; return (nfds); } static void main_loop(struct conf *conf, bool dont_fork) { struct portal_group *pg; struct portal *portal; struct sockaddr_storage client_sa; socklen_t client_salen; #ifdef ICL_KERNEL_PROXY int connection_id; int portal_id; #endif fd_set fdset; int error, nfds, client_fd; pidfile_write(conf->conf_pidfh); for (;;) { if (sighup_received || sigterm_received || timed_out()) return; #ifdef ICL_KERNEL_PROXY if (proxy_mode) { client_salen = sizeof(client_sa); kernel_accept(&connection_id, &portal_id, (struct sockaddr *)&client_sa, &client_salen); assert(client_salen >= client_sa.ss_len); log_debugx("incoming connection, id %d, portal id %d", connection_id, portal_id); TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { TAILQ_FOREACH(portal, &pg->pg_portals, p_next) { if (portal->p_id == portal_id) { goto found; } } } log_errx(1, "kernel returned invalid portal_id %d", portal_id); found: handle_connection(portal, connection_id, (struct sockaddr *)&client_sa, dont_fork); } else { #endif assert(proxy_mode == false); FD_ZERO(&fdset); nfds = 0; TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { TAILQ_FOREACH(portal, &pg->pg_portals, p_next) nfds = fd_add(portal->p_socket, &fdset, nfds); } error = select(nfds + 1, &fdset, NULL, NULL, NULL); if (error <= 0) { if (errno == EINTR) return; log_err(1, "select"); } TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) { TAILQ_FOREACH(portal, &pg->pg_portals, p_next) { if (!FD_ISSET(portal->p_socket, &fdset)) continue; client_salen = sizeof(client_sa); client_fd = accept(portal->p_socket, (struct sockaddr *)&client_sa, &client_salen); if (client_fd < 0) { if (errno == ECONNABORTED) continue; log_err(1, "accept"); } assert(client_salen >= client_sa.ss_len); handle_connection(portal, client_fd, (struct sockaddr *)&client_sa, dont_fork); break; } } #ifdef ICL_KERNEL_PROXY } #endif } } static void sighup_handler(int dummy __unused) { sighup_received = true; } static void sigterm_handler(int dummy __unused) { sigterm_received = true; } static void sigchld_handler(int dummy __unused) { /* * The only purpose of this handler is to make SIGCHLD * interrupt the ISCSIDWAIT ioctl(2), so we can call * wait_for_children(). */ } static void register_signals(void) { struct sigaction sa; int error; bzero(&sa, sizeof(sa)); sa.sa_handler = sighup_handler; sigfillset(&sa.sa_mask); error = sigaction(SIGHUP, &sa, NULL); if (error != 0) log_err(1, "sigaction"); sa.sa_handler = sigterm_handler; error = sigaction(SIGTERM, &sa, NULL); if (error != 0) log_err(1, "sigaction"); sa.sa_handler = sigterm_handler; error = sigaction(SIGINT, &sa, NULL); if (error != 0) log_err(1, "sigaction"); sa.sa_handler = sigchld_handler; error = sigaction(SIGCHLD, &sa, NULL); if (error != 0) log_err(1, "sigaction"); } static void check_perms(const char *path) { struct stat sb; int error; error = stat(path, &sb); if (error != 0) { log_warn("stat"); return; } if (sb.st_mode & S_IWOTH) { log_warnx("%s is world-writable", path); } else if (sb.st_mode & S_IROTH) { log_warnx("%s is world-readable", path); } else if (sb.st_mode & S_IXOTH) { /* * Ok, this one doesn't matter, but still do it, * just for consistency. */ log_warnx("%s is world-executable", path); } /* * XXX: Should we also check for owner != 0? */ } static struct conf * conf_new_from_file(const char *path, struct conf *oldconf, bool ucl) { struct conf *conf; struct auth_group *ag; struct portal_group *pg; struct pport *pp; int error; log_debugx("obtaining configuration from %s", path); conf = conf_new(); TAILQ_FOREACH(pp, &oldconf->conf_pports, pp_next) pport_copy(pp, conf); ag = auth_group_new(conf, "default"); assert(ag != NULL); ag = auth_group_new(conf, "no-authentication"); assert(ag != NULL); ag->ag_type = AG_TYPE_NO_AUTHENTICATION; ag = auth_group_new(conf, "no-access"); assert(ag != NULL); ag->ag_type = AG_TYPE_DENY; pg = portal_group_new(conf, "default"); assert(pg != NULL); if (ucl) error = uclparse_conf(conf, path); else error = parse_conf(conf, path); if (error != 0) { conf_delete(conf); return (NULL); } check_perms(path); if (conf->conf_default_ag_defined == false) { log_debugx("auth-group \"default\" not defined; " "going with defaults"); ag = auth_group_find(conf, "default"); assert(ag != NULL); ag->ag_type = AG_TYPE_DENY; } if (conf->conf_default_pg_defined == false) { log_debugx("portal-group \"default\" not defined; " "going with defaults"); pg = portal_group_find(conf, "default"); assert(pg != NULL); portal_group_add_listen(pg, "0.0.0.0:3260", false); portal_group_add_listen(pg, "[::]:3260", false); } conf->conf_kernel_port_on = true; error = conf_verify(conf); if (error != 0) { conf_delete(conf); return (NULL); } return (conf); } int main(int argc, char **argv) { struct conf *oldconf, *newconf, *tmpconf; struct isns *newns; const char *config_path = DEFAULT_CONFIG_PATH; int debug = 0, ch, error; bool dont_daemonize = false; bool use_ucl = false; while ((ch = getopt(argc, argv, "duf:R")) != -1) { switch (ch) { case 'd': dont_daemonize = true; debug++; break; case 'u': use_ucl = true; break; case 'f': config_path = optarg; break; case 'R': #ifndef ICL_KERNEL_PROXY log_errx(1, "ctld(8) compiled without ICL_KERNEL_PROXY " "does not support iSER protocol"); #endif proxy_mode = true; break; case '?': default: usage(); } } argc -= optind; if (argc != 0) usage(); log_init(debug); kernel_init(); oldconf = conf_new_from_kernel(); newconf = conf_new_from_file(config_path, oldconf, use_ucl); if (newconf == NULL) log_errx(1, "configuration error; exiting"); if (debug > 0) { oldconf->conf_debug = debug; newconf->conf_debug = debug; } error = conf_apply(oldconf, newconf); if (error != 0) log_errx(1, "failed to apply configuration; exiting"); conf_delete(oldconf); oldconf = NULL; register_signals(); if (dont_daemonize == false) { log_debugx("daemonizing"); if (daemon(0, 0) == -1) { log_warn("cannot daemonize"); pidfile_remove(newconf->conf_pidfh); exit(1); } } /* Schedule iSNS update */ if (!TAILQ_EMPTY(&newconf->conf_isns)) set_timeout((newconf->conf_isns_period + 2) / 3, false); for (;;) { main_loop(newconf, dont_daemonize); if (sighup_received) { sighup_received = false; log_debugx("received SIGHUP, reloading configuration"); tmpconf = conf_new_from_file(config_path, newconf, use_ucl); if (tmpconf == NULL) { log_warnx("configuration error, " "continuing with old configuration"); } else { if (debug > 0) tmpconf->conf_debug = debug; oldconf = newconf; newconf = tmpconf; error = conf_apply(oldconf, newconf); if (error != 0) log_warnx("failed to reload " "configuration"); conf_delete(oldconf); oldconf = NULL; } } else if (sigterm_received) { log_debugx("exiting on signal; " "reloading empty configuration"); log_debugx("removing CTL iSCSI ports " "and terminating all connections"); oldconf = newconf; newconf = conf_new(); if (debug > 0) newconf->conf_debug = debug; error = conf_apply(oldconf, newconf); if (error != 0) log_warnx("failed to apply configuration"); conf_delete(oldconf); oldconf = NULL; log_warnx("exiting on signal"); exit(0); } else { nchildren -= wait_for_children(false); assert(nchildren >= 0); if (timed_out()) { set_timeout(0, false); TAILQ_FOREACH(newns, &newconf->conf_isns, i_next) isns_check(newns); /* Schedule iSNS update */ if (!TAILQ_EMPTY(&newconf->conf_isns)) { set_timeout((newconf->conf_isns_period + 2) / 3, false); } } } } /* NOTREACHED */ } Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.h =================================================================== --- projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.h (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.h (revision 312218) @@ -1,462 +1,464 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef CTLD_H #define CTLD_H #include #ifdef ICL_KERNEL_PROXY #include #endif #include #include #include #define DEFAULT_CONFIG_PATH "/etc/ctl.conf" #define DEFAULT_PIDFILE "/var/run/ctld.pid" #define DEFAULT_BLOCKSIZE 512 #define DEFAULT_CD_BLOCKSIZE 2048 #define MAX_LUNS 1024 #define MAX_NAME_LEN 223 #define MAX_DATA_SEGMENT_LENGTH (128 * 1024) -#define MAX_BURST_LENGTH 16776192 -#define FIRST_BURST_LENGTH (128 * 1024) #define SOCKBUF_SIZE 1048576 struct auth { TAILQ_ENTRY(auth) a_next; struct auth_group *a_auth_group; char *a_user; char *a_secret; char *a_mutual_user; char *a_mutual_secret; }; struct auth_name { TAILQ_ENTRY(auth_name) an_next; struct auth_group *an_auth_group; char *an_initator_name; }; struct auth_portal { TAILQ_ENTRY(auth_portal) ap_next; struct auth_group *ap_auth_group; char *ap_initator_portal; struct sockaddr_storage ap_sa; int ap_mask; }; #define AG_TYPE_UNKNOWN 0 #define AG_TYPE_DENY 1 #define AG_TYPE_NO_AUTHENTICATION 2 #define AG_TYPE_CHAP 3 #define AG_TYPE_CHAP_MUTUAL 4 struct auth_group { TAILQ_ENTRY(auth_group) ag_next; struct conf *ag_conf; char *ag_name; struct target *ag_target; int ag_type; TAILQ_HEAD(, auth) ag_auths; TAILQ_HEAD(, auth_name) ag_names; TAILQ_HEAD(, auth_portal) ag_portals; }; struct portal { TAILQ_ENTRY(portal) p_next; struct portal_group *p_portal_group; bool p_iser; char *p_listen; struct addrinfo *p_ai; #ifdef ICL_KERNEL_PROXY int p_id; #endif TAILQ_HEAD(, target) p_targets; int p_socket; }; TAILQ_HEAD(options, option); #define PG_FILTER_UNKNOWN 0 #define PG_FILTER_NONE 1 #define PG_FILTER_PORTAL 2 #define PG_FILTER_PORTAL_NAME 3 #define PG_FILTER_PORTAL_NAME_AUTH 4 struct portal_group { TAILQ_ENTRY(portal_group) pg_next; struct conf *pg_conf; struct options pg_options; char *pg_name; struct auth_group *pg_discovery_auth_group; int pg_discovery_filter; int pg_foreign; bool pg_unassigned; TAILQ_HEAD(, portal) pg_portals; TAILQ_HEAD(, port) pg_ports; char *pg_offload; char *pg_redirection; uint16_t pg_tag; }; struct pport { TAILQ_ENTRY(pport) pp_next; TAILQ_HEAD(, port) pp_ports; struct conf *pp_conf; char *pp_name; uint32_t pp_ctl_port; }; struct port { TAILQ_ENTRY(port) p_next; TAILQ_ENTRY(port) p_pgs; TAILQ_ENTRY(port) p_pps; TAILQ_ENTRY(port) p_ts; struct conf *p_conf; char *p_name; struct auth_group *p_auth_group; struct portal_group *p_portal_group; struct pport *p_pport; struct target *p_target; int p_foreign; uint32_t p_ctl_port; }; struct option { TAILQ_ENTRY(option) o_next; char *o_name; char *o_value; }; struct lun { TAILQ_ENTRY(lun) l_next; struct conf *l_conf; struct options l_options; char *l_name; char *l_backend; uint8_t l_device_type; int l_blocksize; char *l_device_id; char *l_path; char *l_scsiname; char *l_serial; int64_t l_size; int l_ctl_lun; }; struct target { TAILQ_ENTRY(target) t_next; struct conf *t_conf; struct lun *t_luns[MAX_LUNS]; struct auth_group *t_auth_group; TAILQ_HEAD(, port) t_ports; char *t_name; char *t_alias; char *t_redirection; }; struct isns { TAILQ_ENTRY(isns) i_next; struct conf *i_conf; char *i_addr; struct addrinfo *i_ai; }; struct conf { char *conf_pidfile_path; TAILQ_HEAD(, lun) conf_luns; TAILQ_HEAD(, target) conf_targets; TAILQ_HEAD(, auth_group) conf_auth_groups; TAILQ_HEAD(, port) conf_ports; TAILQ_HEAD(, portal_group) conf_portal_groups; TAILQ_HEAD(, pport) conf_pports; TAILQ_HEAD(, isns) conf_isns; int conf_isns_period; int conf_isns_timeout; int conf_debug; int conf_timeout; int conf_maxproc; #ifdef ICL_KERNEL_PROXY int conf_portal_id; #endif struct pidfh *conf_pidfh; bool conf_default_pg_defined; bool conf_default_ag_defined; bool conf_kernel_port_on; }; #define CONN_SESSION_TYPE_NONE 0 #define CONN_SESSION_TYPE_DISCOVERY 1 #define CONN_SESSION_TYPE_NORMAL 2 #define CONN_DIGEST_NONE 0 #define CONN_DIGEST_CRC32C 1 struct connection { struct portal *conn_portal; struct port *conn_port; struct target *conn_target; int conn_socket; int conn_session_type; char *conn_initiator_name; char *conn_initiator_addr; char *conn_initiator_alias; uint8_t conn_initiator_isid[6]; struct sockaddr_storage conn_initiator_sa; uint32_t conn_cmdsn; uint32_t conn_statsn; + int conn_max_recv_data_segment_limit; + int conn_max_send_data_segment_limit; + int conn_max_burst_limit; + int conn_first_burst_limit; int conn_max_recv_data_segment_length; int conn_max_send_data_segment_length; int conn_max_burst_length; int conn_first_burst_length; int conn_immediate_data; int conn_header_digest; int conn_data_digest; const char *conn_user; struct chap *conn_chap; }; struct pdu { struct connection *pdu_connection; struct iscsi_bhs *pdu_bhs; char *pdu_data; size_t pdu_data_len; }; #define KEYS_MAX 1024 struct keys { char *keys_names[KEYS_MAX]; char *keys_values[KEYS_MAX]; char *keys_data; size_t keys_data_len; }; #define CHAP_CHALLENGE_LEN 1024 #define CHAP_DIGEST_LEN 16 /* Equal to MD5 digest size. */ struct chap { unsigned char chap_id; char chap_challenge[CHAP_CHALLENGE_LEN]; char chap_response[CHAP_DIGEST_LEN]; }; struct rchap { char *rchap_secret; unsigned char rchap_id; void *rchap_challenge; size_t rchap_challenge_len; }; struct chap *chap_new(void); char *chap_get_id(const struct chap *chap); char *chap_get_challenge(const struct chap *chap); int chap_receive(struct chap *chap, const char *response); int chap_authenticate(struct chap *chap, const char *secret); void chap_delete(struct chap *chap); struct rchap *rchap_new(const char *secret); int rchap_receive(struct rchap *rchap, const char *id, const char *challenge); char *rchap_get_response(struct rchap *rchap); void rchap_delete(struct rchap *rchap); int parse_conf(struct conf *conf, const char *path); int uclparse_conf(struct conf *conf, const char *path); struct conf *conf_new(void); struct conf *conf_new_from_kernel(void); void conf_delete(struct conf *conf); int conf_verify(struct conf *conf); struct auth_group *auth_group_new(struct conf *conf, const char *name); void auth_group_delete(struct auth_group *ag); struct auth_group *auth_group_find(const struct conf *conf, const char *name); int auth_group_set_type(struct auth_group *ag, const char *type); const struct auth *auth_new_chap(struct auth_group *ag, const char *user, const char *secret); const struct auth *auth_new_chap_mutual(struct auth_group *ag, const char *user, const char *secret, const char *user2, const char *secret2); const struct auth *auth_find(const struct auth_group *ag, const char *user); const struct auth_name *auth_name_new(struct auth_group *ag, const char *initiator_name); bool auth_name_defined(const struct auth_group *ag); const struct auth_name *auth_name_find(const struct auth_group *ag, const char *initiator_name); int auth_name_check(const struct auth_group *ag, const char *initiator_name); const struct auth_portal *auth_portal_new(struct auth_group *ag, const char *initiator_portal); bool auth_portal_defined(const struct auth_group *ag); const struct auth_portal *auth_portal_find(const struct auth_group *ag, const struct sockaddr_storage *sa); int auth_portal_check(const struct auth_group *ag, const struct sockaddr_storage *sa); struct portal_group *portal_group_new(struct conf *conf, const char *name); void portal_group_delete(struct portal_group *pg); struct portal_group *portal_group_find(const struct conf *conf, const char *name); int portal_group_add_listen(struct portal_group *pg, const char *listen, bool iser); int portal_group_set_filter(struct portal_group *pg, const char *filter); int portal_group_set_offload(struct portal_group *pg, const char *offload); int portal_group_set_redirection(struct portal_group *pg, const char *addr); int isns_new(struct conf *conf, const char *addr); void isns_delete(struct isns *is); void isns_register(struct isns *isns, struct isns *oldisns); void isns_check(struct isns *isns); void isns_deregister(struct isns *isns); struct pport *pport_new(struct conf *conf, const char *name, uint32_t ctl_port); struct pport *pport_find(const struct conf *conf, const char *name); struct pport *pport_copy(struct pport *pport, struct conf *conf); void pport_delete(struct pport *pport); struct port *port_new(struct conf *conf, struct target *target, struct portal_group *pg); struct port *port_new_pp(struct conf *conf, struct target *target, struct pport *pp); struct port *port_find(const struct conf *conf, const char *name); struct port *port_find_in_pg(const struct portal_group *pg, const char *target); void port_delete(struct port *port); struct target *target_new(struct conf *conf, const char *name); void target_delete(struct target *target); struct target *target_find(struct conf *conf, const char *name); int target_set_redirection(struct target *target, const char *addr); struct lun *lun_new(struct conf *conf, const char *name); void lun_delete(struct lun *lun); struct lun *lun_find(const struct conf *conf, const char *name); void lun_set_backend(struct lun *lun, const char *value); void lun_set_device_type(struct lun *lun, uint8_t value); void lun_set_blocksize(struct lun *lun, size_t value); void lun_set_device_id(struct lun *lun, const char *value); void lun_set_path(struct lun *lun, const char *value); void lun_set_scsiname(struct lun *lun, const char *value); void lun_set_serial(struct lun *lun, const char *value); void lun_set_size(struct lun *lun, size_t value); void lun_set_ctl_lun(struct lun *lun, uint32_t value); struct option *option_new(struct options *os, const char *name, const char *value); void option_delete(struct options *os, struct option *co); struct option *option_find(const struct options *os, const char *name); void option_set(struct option *o, const char *value); void kernel_init(void); int kernel_lun_add(struct lun *lun); int kernel_lun_modify(struct lun *lun); int kernel_lun_remove(struct lun *lun); void kernel_handoff(struct connection *conn); void kernel_limits(const char *offload, int *max_recv_data_segment_length, int *max_send_data_segment_length, int *max_burst_length, int *first_burst_length); int kernel_port_add(struct port *port); int kernel_port_update(struct port *port, struct port *old); int kernel_port_remove(struct port *port); void kernel_capsicate(void); #ifdef ICL_KERNEL_PROXY void kernel_listen(struct addrinfo *ai, bool iser, int portal_id); void kernel_accept(int *connection_id, int *portal_id, struct sockaddr *client_sa, socklen_t *client_salen); void kernel_send(struct pdu *pdu); void kernel_receive(struct pdu *pdu); #endif struct keys *keys_new(void); void keys_delete(struct keys *keys); void keys_load(struct keys *keys, const struct pdu *pdu); void keys_save(struct keys *keys, struct pdu *pdu); const char *keys_find(struct keys *keys, const char *name); void keys_add(struct keys *keys, const char *name, const char *value); void keys_add_int(struct keys *keys, const char *name, int value); struct pdu *pdu_new(struct connection *conn); struct pdu *pdu_new_response(struct pdu *request); void pdu_delete(struct pdu *pdu); void pdu_receive(struct pdu *request); void pdu_send(struct pdu *response); void login(struct connection *conn); void discovery(struct connection *conn); void log_init(int level); void log_set_peer_name(const char *name); void log_set_peer_addr(const char *addr); void log_err(int, const char *, ...) __dead2 __printflike(2, 3); void log_errx(int, const char *, ...) __dead2 __printflike(2, 3); void log_warn(const char *, ...) __printflike(1, 2); void log_warnx(const char *, ...) __printflike(1, 2); void log_debugx(const char *, ...) __printflike(1, 2); char *checked_strdup(const char *); bool valid_iscsi_name(const char *name); void set_timeout(int timeout, int fatal); bool timed_out(void); #endif /* !CTLD_H */ Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/login.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/login.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/login.c (revision 312218) @@ -1,1038 +1,1057 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "ctld.h" #include "iscsi_proto.h" static void login_send_error(struct pdu *request, char class, char detail); static void login_set_nsg(struct pdu *response, int nsg) { struct iscsi_bhs_login_response *bhslr; assert(nsg == BHSLR_STAGE_SECURITY_NEGOTIATION || nsg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION || nsg == BHSLR_STAGE_FULL_FEATURE_PHASE); bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; bhslr->bhslr_flags &= 0xFC; bhslr->bhslr_flags |= nsg; bhslr->bhslr_flags |= BHSLR_FLAGS_TRANSIT; } static int login_csg(const struct pdu *request) { struct iscsi_bhs_login_request *bhslr; bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; return ((bhslr->bhslr_flags & 0x0C) >> 2); } static void login_set_csg(struct pdu *response, int csg) { struct iscsi_bhs_login_response *bhslr; assert(csg == BHSLR_STAGE_SECURITY_NEGOTIATION || csg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION || csg == BHSLR_STAGE_FULL_FEATURE_PHASE); bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; bhslr->bhslr_flags &= 0xF3; bhslr->bhslr_flags |= csg << 2; } static struct pdu * login_receive(struct connection *conn, bool initial) { struct pdu *request; struct iscsi_bhs_login_request *bhslr; request = pdu_new(conn); pdu_receive(request); if ((request->pdu_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) != ISCSI_BHS_OPCODE_LOGIN_REQUEST) { /* * The first PDU in session is special - if we receive any PDU * different than login request, we have to drop the connection * without sending response ("A target receiving any PDU * except a Login request before the Login Phase is started MUST * immediately terminate the connection on which the PDU * was received.") */ if (initial == false) login_send_error(request, 0x02, 0x0b); log_errx(1, "protocol error: received invalid opcode 0x%x", request->pdu_bhs->bhs_opcode); } bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; /* * XXX: Implement the C flag some day. */ if ((bhslr->bhslr_flags & BHSLR_FLAGS_CONTINUE) != 0) { login_send_error(request, 0x03, 0x00); log_errx(1, "received Login PDU with unsupported \"C\" flag"); } if (bhslr->bhslr_version_max != 0x00) { login_send_error(request, 0x02, 0x05); log_errx(1, "received Login PDU with unsupported " "Version-max 0x%x", bhslr->bhslr_version_max); } if (bhslr->bhslr_version_min != 0x00) { login_send_error(request, 0x02, 0x05); log_errx(1, "received Login PDU with unsupported " "Version-min 0x%x", bhslr->bhslr_version_min); } if (initial == false && ISCSI_SNLT(ntohl(bhslr->bhslr_cmdsn), conn->conn_cmdsn)) { login_send_error(request, 0x02, 0x00); log_errx(1, "received Login PDU with decreasing CmdSN: " "was %u, is %u", conn->conn_cmdsn, ntohl(bhslr->bhslr_cmdsn)); } if (initial == false && ntohl(bhslr->bhslr_expstatsn) != conn->conn_statsn) { login_send_error(request, 0x02, 0x00); log_errx(1, "received Login PDU with wrong ExpStatSN: " "is %u, should be %u", ntohl(bhslr->bhslr_expstatsn), conn->conn_statsn); } conn->conn_cmdsn = ntohl(bhslr->bhslr_cmdsn); return (request); } static struct pdu * login_new_response(struct pdu *request) { struct pdu *response; struct connection *conn; struct iscsi_bhs_login_request *bhslr; struct iscsi_bhs_login_response *bhslr2; bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; conn = request->pdu_connection; response = pdu_new_response(request); bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs; bhslr2->bhslr_opcode = ISCSI_BHS_OPCODE_LOGIN_RESPONSE; login_set_csg(response, BHSLR_STAGE_SECURITY_NEGOTIATION); memcpy(bhslr2->bhslr_isid, bhslr->bhslr_isid, sizeof(bhslr2->bhslr_isid)); bhslr2->bhslr_initiator_task_tag = bhslr->bhslr_initiator_task_tag; bhslr2->bhslr_statsn = htonl(conn->conn_statsn++); bhslr2->bhslr_expcmdsn = htonl(conn->conn_cmdsn); bhslr2->bhslr_maxcmdsn = htonl(conn->conn_cmdsn); return (response); } static void login_send_error(struct pdu *request, char class, char detail) { struct pdu *response; struct iscsi_bhs_login_response *bhslr2; log_debugx("sending Login Response PDU with failure class 0x%x/0x%x; " "see next line for reason", class, detail); response = login_new_response(request); bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs; bhslr2->bhslr_status_class = class; bhslr2->bhslr_status_detail = detail; pdu_send(response); pdu_delete(response); } static int login_list_contains(const char *list, const char *what) { char *tofree, *str, *token; tofree = str = checked_strdup(list); while ((token = strsep(&str, ",")) != NULL) { if (strcmp(token, what) == 0) { free(tofree); return (1); } } free(tofree); return (0); } static int login_list_prefers(const char *list, const char *choice1, const char *choice2) { char *tofree, *str, *token; tofree = str = checked_strdup(list); while ((token = strsep(&str, ",")) != NULL) { if (strcmp(token, choice1) == 0) { free(tofree); return (1); } if (strcmp(token, choice2) == 0) { free(tofree); return (2); } } free(tofree); return (-1); } static struct pdu * login_receive_chap_a(struct connection *conn) { struct pdu *request; struct keys *request_keys; const char *chap_a; request = login_receive(conn, false); request_keys = keys_new(); keys_load(request_keys, request); chap_a = keys_find(request_keys, "CHAP_A"); if (chap_a == NULL) { login_send_error(request, 0x02, 0x07); log_errx(1, "received CHAP Login PDU without CHAP_A"); } if (login_list_contains(chap_a, "5") == 0) { login_send_error(request, 0x02, 0x01); log_errx(1, "received CHAP Login PDU with unsupported CHAP_A " "\"%s\"", chap_a); } keys_delete(request_keys); return (request); } static void login_send_chap_c(struct pdu *request, struct chap *chap) { struct pdu *response; struct keys *response_keys; char *chap_c, *chap_i; chap_c = chap_get_challenge(chap); chap_i = chap_get_id(chap); response = login_new_response(request); response_keys = keys_new(); keys_add(response_keys, "CHAP_A", "5"); keys_add(response_keys, "CHAP_I", chap_i); keys_add(response_keys, "CHAP_C", chap_c); free(chap_i); free(chap_c); keys_save(response_keys, response); pdu_send(response); pdu_delete(response); keys_delete(response_keys); } static struct pdu * login_receive_chap_r(struct connection *conn, struct auth_group *ag, struct chap *chap, const struct auth **authp) { struct pdu *request; struct keys *request_keys; const char *chap_n, *chap_r; const struct auth *auth; int error; request = login_receive(conn, false); request_keys = keys_new(); keys_load(request_keys, request); chap_n = keys_find(request_keys, "CHAP_N"); if (chap_n == NULL) { login_send_error(request, 0x02, 0x07); log_errx(1, "received CHAP Login PDU without CHAP_N"); } chap_r = keys_find(request_keys, "CHAP_R"); if (chap_r == NULL) { login_send_error(request, 0x02, 0x07); log_errx(1, "received CHAP Login PDU without CHAP_R"); } error = chap_receive(chap, chap_r); if (error != 0) { login_send_error(request, 0x02, 0x07); log_errx(1, "received CHAP Login PDU with malformed CHAP_R"); } /* * Verify the response. */ assert(ag->ag_type == AG_TYPE_CHAP || ag->ag_type == AG_TYPE_CHAP_MUTUAL); auth = auth_find(ag, chap_n); if (auth == NULL) { login_send_error(request, 0x02, 0x01); log_errx(1, "received CHAP Login with invalid user \"%s\"", chap_n); } assert(auth->a_secret != NULL); assert(strlen(auth->a_secret) > 0); error = chap_authenticate(chap, auth->a_secret); if (error != 0) { login_send_error(request, 0x02, 0x01); log_errx(1, "CHAP authentication failed for user \"%s\"", auth->a_user); } keys_delete(request_keys); *authp = auth; return (request); } static void login_send_chap_success(struct pdu *request, const struct auth *auth) { struct pdu *response; struct keys *request_keys, *response_keys; struct rchap *rchap; const char *chap_i, *chap_c; char *chap_r; int error; response = login_new_response(request); login_set_nsg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION); /* * Actually, one more thing: mutual authentication. */ request_keys = keys_new(); keys_load(request_keys, request); chap_i = keys_find(request_keys, "CHAP_I"); chap_c = keys_find(request_keys, "CHAP_C"); if (chap_i != NULL || chap_c != NULL) { if (chap_i == NULL) { login_send_error(request, 0x02, 0x07); log_errx(1, "initiator requested target " "authentication, but didn't send CHAP_I"); } if (chap_c == NULL) { login_send_error(request, 0x02, 0x07); log_errx(1, "initiator requested target " "authentication, but didn't send CHAP_C"); } if (auth->a_auth_group->ag_type != AG_TYPE_CHAP_MUTUAL) { login_send_error(request, 0x02, 0x01); log_errx(1, "initiator requests target authentication " "for user \"%s\", but mutual user/secret " "is not set", auth->a_user); } log_debugx("performing mutual authentication as user \"%s\"", auth->a_mutual_user); rchap = rchap_new(auth->a_mutual_secret); error = rchap_receive(rchap, chap_i, chap_c); if (error != 0) { login_send_error(request, 0x02, 0x07); log_errx(1, "received CHAP Login PDU with malformed " "CHAP_I or CHAP_C"); } chap_r = rchap_get_response(rchap); rchap_delete(rchap); response_keys = keys_new(); keys_add(response_keys, "CHAP_N", auth->a_mutual_user); keys_add(response_keys, "CHAP_R", chap_r); free(chap_r); keys_save(response_keys, response); keys_delete(response_keys); } else { log_debugx("initiator did not request target authentication"); } keys_delete(request_keys); pdu_send(response); pdu_delete(response); } static void login_chap(struct connection *conn, struct auth_group *ag) { const struct auth *auth; struct chap *chap; struct pdu *request; /* * Receive CHAP_A PDU. */ log_debugx("beginning CHAP authentication; waiting for CHAP_A"); request = login_receive_chap_a(conn); /* * Generate the challenge. */ chap = chap_new(); /* * Send the challenge. */ log_debugx("sending CHAP_C, binary challenge size is %zd bytes", sizeof(chap->chap_challenge)); login_send_chap_c(request, chap); pdu_delete(request); /* * Receive CHAP_N/CHAP_R PDU and authenticate. */ log_debugx("waiting for CHAP_N/CHAP_R"); request = login_receive_chap_r(conn, ag, chap, &auth); /* * Yay, authentication succeeded! */ log_debugx("authentication succeeded for user \"%s\"; " "transitioning to Negotiation Phase", auth->a_user); login_send_chap_success(request, auth); pdu_delete(request); /* * Leave username and CHAP information for discovery(). */ conn->conn_user = auth->a_user; conn->conn_chap = chap; } static void login_negotiate_key(struct pdu *request, const char *name, const char *value, bool skipped_security, struct keys *response_keys) { int which; size_t tmp; struct connection *conn; conn = request->pdu_connection; if (strcmp(name, "InitiatorName") == 0) { if (!skipped_security) log_errx(1, "initiator resent InitiatorName"); } else if (strcmp(name, "SessionType") == 0) { if (!skipped_security) log_errx(1, "initiator resent SessionType"); } else if (strcmp(name, "TargetName") == 0) { if (!skipped_security) log_errx(1, "initiator resent TargetName"); } else if (strcmp(name, "InitiatorAlias") == 0) { if (conn->conn_initiator_alias != NULL) free(conn->conn_initiator_alias); conn->conn_initiator_alias = checked_strdup(value); } else if (strcmp(value, "Irrelevant") == 0) { /* Ignore. */ } else if (strcmp(name, "HeaderDigest") == 0) { /* * We don't handle digests for discovery sessions. */ if (conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY) { log_debugx("discovery session; digests disabled"); keys_add(response_keys, name, "None"); return; } which = login_list_prefers(value, "CRC32C", "None"); switch (which) { case 1: log_debugx("initiator prefers CRC32C " "for header digest; we'll use it"); conn->conn_header_digest = CONN_DIGEST_CRC32C; keys_add(response_keys, name, "CRC32C"); break; case 2: log_debugx("initiator prefers not to do " "header digest; we'll comply"); keys_add(response_keys, name, "None"); break; default: log_warnx("initiator sent unrecognized " "HeaderDigest value \"%s\"; will use None", value); keys_add(response_keys, name, "None"); break; } } else if (strcmp(name, "DataDigest") == 0) { if (conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY) { log_debugx("discovery session; digests disabled"); keys_add(response_keys, name, "None"); return; } which = login_list_prefers(value, "CRC32C", "None"); switch (which) { case 1: log_debugx("initiator prefers CRC32C " "for data digest; we'll use it"); conn->conn_data_digest = CONN_DIGEST_CRC32C; keys_add(response_keys, name, "CRC32C"); break; case 2: log_debugx("initiator prefers not to do " "data digest; we'll comply"); keys_add(response_keys, name, "None"); break; default: log_warnx("initiator sent unrecognized " "DataDigest value \"%s\"; will use None", value); keys_add(response_keys, name, "None"); break; } } else if (strcmp(name, "MaxConnections") == 0) { keys_add(response_keys, name, "1"); } else if (strcmp(name, "InitialR2T") == 0) { keys_add(response_keys, name, "Yes"); } else if (strcmp(name, "ImmediateData") == 0) { if (conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY) { log_debugx("discovery session; ImmediateData irrelevant"); keys_add(response_keys, name, "Irrelevant"); } else { if (strcmp(value, "Yes") == 0) { conn->conn_immediate_data = true; keys_add(response_keys, name, "Yes"); } else { conn->conn_immediate_data = false; keys_add(response_keys, name, "No"); } } } else if (strcmp(name, "MaxRecvDataSegmentLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) { login_send_error(request, 0x02, 0x00); log_errx(1, "received invalid " "MaxRecvDataSegmentLength"); } /* * MaxRecvDataSegmentLength is a direction-specific parameter. * We'll limit our _send_ to what the initiator can handle but * our MaxRecvDataSegmentLength is not influenced by the * initiator in any way. */ - if ((int)tmp > conn->conn_max_send_data_segment_length) { - log_debugx("capping max_send_data_segment_length " + if ((int)tmp > conn->conn_max_send_data_segment_limit) { + log_debugx("capping MaxRecvDataSegmentLength " "from %zd to %d", tmp, - conn->conn_max_send_data_segment_length); - tmp = conn->conn_max_send_data_segment_length; + conn->conn_max_send_data_segment_limit); + tmp = conn->conn_max_send_data_segment_limit; } conn->conn_max_send_data_segment_length = tmp; + conn->conn_max_recv_data_segment_length = + conn->conn_max_recv_data_segment_limit; keys_add_int(response_keys, name, conn->conn_max_recv_data_segment_length); } else if (strcmp(name, "MaxBurstLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) { login_send_error(request, 0x02, 0x00); log_errx(1, "received invalid MaxBurstLength"); } - if ((int)tmp > conn->conn_max_burst_length) { + if ((int)tmp > conn->conn_max_burst_limit) { log_debugx("capping MaxBurstLength from %zd to %d", - tmp, conn->conn_max_burst_length); - tmp = conn->conn_max_burst_length; + tmp, conn->conn_max_burst_limit); + tmp = conn->conn_max_burst_limit; } conn->conn_max_burst_length = tmp; keys_add_int(response_keys, name, tmp); } else if (strcmp(name, "FirstBurstLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) { login_send_error(request, 0x02, 0x00); log_errx(1, "received invalid FirstBurstLength"); } - if ((int)tmp > conn->conn_first_burst_length) { + if ((int)tmp > conn->conn_first_burst_limit) { log_debugx("capping FirstBurstLength from %zd to %d", - tmp, conn->conn_first_burst_length); - tmp = conn->conn_first_burst_length; + tmp, conn->conn_first_burst_limit); + tmp = conn->conn_first_burst_limit; } conn->conn_first_burst_length = tmp; keys_add_int(response_keys, name, tmp); } else if (strcmp(name, "DefaultTime2Wait") == 0) { keys_add(response_keys, name, value); } else if (strcmp(name, "DefaultTime2Retain") == 0) { keys_add(response_keys, name, "0"); } else if (strcmp(name, "MaxOutstandingR2T") == 0) { keys_add(response_keys, name, "1"); } else if (strcmp(name, "DataPDUInOrder") == 0) { keys_add(response_keys, name, "Yes"); } else if (strcmp(name, "DataSequenceInOrder") == 0) { keys_add(response_keys, name, "Yes"); } else if (strcmp(name, "ErrorRecoveryLevel") == 0) { keys_add(response_keys, name, "0"); } else if (strcmp(name, "OFMarker") == 0) { keys_add(response_keys, name, "No"); } else if (strcmp(name, "IFMarker") == 0) { keys_add(response_keys, name, "No"); } else if (strcmp(name, "iSCSIProtocolLevel") == 0) { tmp = strtoul(value, NULL, 10); if (tmp > 2) tmp = 2; keys_add_int(response_keys, name, tmp); } else { log_debugx("unknown key \"%s\"; responding " "with NotUnderstood", name); keys_add(response_keys, name, "NotUnderstood"); } } static void login_redirect(struct pdu *request, const char *target_address) { struct pdu *response; struct iscsi_bhs_login_response *bhslr2; struct keys *response_keys; response = login_new_response(request); login_set_csg(response, login_csg(request)); bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs; bhslr2->bhslr_status_class = 0x01; bhslr2->bhslr_status_detail = 0x01; response_keys = keys_new(); keys_add(response_keys, "TargetAddress", target_address); keys_save(response_keys, response); pdu_send(response); pdu_delete(response); keys_delete(response_keys); } static bool login_portal_redirect(struct connection *conn, struct pdu *request) { const struct portal_group *pg; pg = conn->conn_portal->p_portal_group; if (pg->pg_redirection == NULL) return (false); log_debugx("portal-group \"%s\" configured to redirect to %s", pg->pg_name, pg->pg_redirection); login_redirect(request, pg->pg_redirection); return (true); } static bool login_target_redirect(struct connection *conn, struct pdu *request) { const char *target_address; assert(conn->conn_portal->p_portal_group->pg_redirection == NULL); if (conn->conn_target == NULL) return (false); target_address = conn->conn_target->t_redirection; if (target_address == NULL) return (false); log_debugx("target \"%s\" configured to redirect to %s", conn->conn_target->t_name, target_address); login_redirect(request, target_address); return (true); } static void login_negotiate(struct connection *conn, struct pdu *request) { struct pdu *response; struct iscsi_bhs_login_response *bhslr2; struct keys *request_keys, *response_keys; int i; bool redirected, skipped_security; if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) { /* * Query the kernel for various size limits. In case of * offload, it depends on hardware capabilities. */ assert(conn->conn_target != NULL); + conn->conn_max_recv_data_segment_limit = (1 << 24) - 1; + conn->conn_max_send_data_segment_limit = (1 << 24) - 1; + conn->conn_max_burst_limit = (1 << 24) - 1; + conn->conn_first_burst_limit = (1 << 24) - 1; kernel_limits(conn->conn_portal->p_portal_group->pg_offload, - &conn->conn_max_recv_data_segment_length, - &conn->conn_max_send_data_segment_length, - &conn->conn_max_burst_length, - &conn->conn_first_burst_length); + &conn->conn_max_recv_data_segment_limit, + &conn->conn_max_send_data_segment_limit, + &conn->conn_max_burst_limit, + &conn->conn_first_burst_limit); /* We expect legal, usable values at this point. */ - assert(conn->conn_max_recv_data_segment_length >= 512); - assert(conn->conn_max_recv_data_segment_length < (1 << 24)); - assert(conn->conn_max_burst_length >= 512); - assert(conn->conn_max_burst_length < (1 << 24)); - assert(conn->conn_first_burst_length >= 512); - assert(conn->conn_first_burst_length < (1 << 24)); - assert(conn->conn_first_burst_length <= - conn->conn_max_burst_length); + assert(conn->conn_max_recv_data_segment_limit >= 512); + assert(conn->conn_max_recv_data_segment_limit < (1 << 24)); + assert(conn->conn_max_send_data_segment_limit >= 512); + assert(conn->conn_max_send_data_segment_limit < (1 << 24)); + assert(conn->conn_max_burst_limit >= 512); + assert(conn->conn_max_burst_limit < (1 << 24)); + assert(conn->conn_first_burst_limit >= 512); + assert(conn->conn_first_burst_limit < (1 << 24)); + assert(conn->conn_first_burst_limit <= + conn->conn_max_burst_limit); + + /* + * Limit default send length in case it won't be negotiated. + * We can't do it for other limits, since they may affect both + * sender and receiver operation, and we must obey defaults. + */ + if (conn->conn_max_send_data_segment_limit < + conn->conn_max_send_data_segment_length) { + conn->conn_max_send_data_segment_length = + conn->conn_max_send_data_segment_limit; + } } else { - conn->conn_max_recv_data_segment_length = + conn->conn_max_recv_data_segment_limit = MAX_DATA_SEGMENT_LENGTH; - conn->conn_max_send_data_segment_length = + conn->conn_max_send_data_segment_limit = MAX_DATA_SEGMENT_LENGTH; } if (request == NULL) { log_debugx("beginning operational parameter negotiation; " "waiting for Login PDU"); request = login_receive(conn, false); skipped_security = false; } else skipped_security = true; /* * RFC 3720, 10.13.5. Status-Class and Status-Detail, says * the redirection SHOULD be accepted by the initiator before * authentication, but MUST be be accepted afterwards; that's * why we're doing it here and not earlier. */ redirected = login_target_redirect(conn, request); if (redirected) { log_debugx("initiator redirected; exiting"); exit(0); } request_keys = keys_new(); keys_load(request_keys, request); response = login_new_response(request); bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs; bhslr2->bhslr_tsih = htons(0xbadd); login_set_csg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION); login_set_nsg(response, BHSLR_STAGE_FULL_FEATURE_PHASE); response_keys = keys_new(); if (skipped_security && conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) { if (conn->conn_target->t_alias != NULL) keys_add(response_keys, "TargetAlias", conn->conn_target->t_alias); keys_add_int(response_keys, "TargetPortalGroupTag", conn->conn_portal->p_portal_group->pg_tag); } for (i = 0; i < KEYS_MAX; i++) { if (request_keys->keys_names[i] == NULL) break; login_negotiate_key(request, request_keys->keys_names[i], request_keys->keys_values[i], skipped_security, response_keys); } /* * We'd started with usable values at our end. But a bad initiator * could have presented a large FirstBurstLength and then a smaller * MaxBurstLength (in that order) and because we process the key/value * pairs in the order they are in the request we might have ended up * with illegal values here. */ if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL && conn->conn_first_burst_length > conn->conn_max_burst_length) { log_errx(1, "initiator sent FirstBurstLength > MaxBurstLength"); } log_debugx("operational parameter negotiation done; " "transitioning to Full Feature Phase"); keys_save(response_keys, response); pdu_send(response); pdu_delete(response); keys_delete(response_keys); pdu_delete(request); keys_delete(request_keys); } static void login_wait_transition(struct connection *conn) { struct pdu *request, *response; struct iscsi_bhs_login_request *bhslr; log_debugx("waiting for state transition request"); request = login_receive(conn, false); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) == 0) { login_send_error(request, 0x02, 0x00); log_errx(1, "got no \"T\" flag after answering AuthMethod"); } log_debugx("got state transition request"); response = login_new_response(request); pdu_delete(request); login_set_nsg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION); pdu_send(response); pdu_delete(response); login_negotiate(conn, NULL); } void login(struct connection *conn) { struct pdu *request, *response; struct iscsi_bhs_login_request *bhslr; struct keys *request_keys, *response_keys; struct auth_group *ag; struct portal_group *pg; const char *initiator_name, *initiator_alias, *session_type, *target_name, *auth_method; bool redirected, fail, trans; /* * Handle the initial Login Request - figure out required authentication * method and either transition to the next phase, if no authentication * is required, or call appropriate authentication code. */ log_debugx("beginning Login Phase; waiting for Login PDU"); request = login_receive(conn, true); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; if (bhslr->bhslr_tsih != 0) { login_send_error(request, 0x02, 0x0a); log_errx(1, "received Login PDU with non-zero TSIH"); } pg = conn->conn_portal->p_portal_group; memcpy(conn->conn_initiator_isid, bhslr->bhslr_isid, sizeof(conn->conn_initiator_isid)); /* * XXX: Implement the C flag some day. */ request_keys = keys_new(); keys_load(request_keys, request); assert(conn->conn_initiator_name == NULL); initiator_name = keys_find(request_keys, "InitiatorName"); if (initiator_name == NULL) { login_send_error(request, 0x02, 0x07); log_errx(1, "received Login PDU without InitiatorName"); } if (valid_iscsi_name(initiator_name) == false) { login_send_error(request, 0x02, 0x00); log_errx(1, "received Login PDU with invalid InitiatorName"); } conn->conn_initiator_name = checked_strdup(initiator_name); log_set_peer_name(conn->conn_initiator_name); setproctitle("%s (%s)", conn->conn_initiator_addr, conn->conn_initiator_name); redirected = login_portal_redirect(conn, request); if (redirected) { log_debugx("initiator redirected; exiting"); exit(0); } initiator_alias = keys_find(request_keys, "InitiatorAlias"); if (initiator_alias != NULL) conn->conn_initiator_alias = checked_strdup(initiator_alias); assert(conn->conn_session_type == CONN_SESSION_TYPE_NONE); session_type = keys_find(request_keys, "SessionType"); if (session_type != NULL) { if (strcmp(session_type, "Normal") == 0) { conn->conn_session_type = CONN_SESSION_TYPE_NORMAL; } else if (strcmp(session_type, "Discovery") == 0) { conn->conn_session_type = CONN_SESSION_TYPE_DISCOVERY; } else { login_send_error(request, 0x02, 0x00); log_errx(1, "received Login PDU with invalid " "SessionType \"%s\"", session_type); } } else conn->conn_session_type = CONN_SESSION_TYPE_NORMAL; assert(conn->conn_target == NULL); if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) { target_name = keys_find(request_keys, "TargetName"); if (target_name == NULL) { login_send_error(request, 0x02, 0x07); log_errx(1, "received Login PDU without TargetName"); } conn->conn_port = port_find_in_pg(pg, target_name); if (conn->conn_port == NULL) { login_send_error(request, 0x02, 0x03); log_errx(1, "requested target \"%s\" not found", target_name); } conn->conn_target = conn->conn_port->p_target; } /* * At this point we know what kind of authentication we need. */ if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) { ag = conn->conn_port->p_auth_group; if (ag == NULL) ag = conn->conn_target->t_auth_group; if (ag->ag_name != NULL) { log_debugx("initiator requests to connect " "to target \"%s\"; auth-group \"%s\"", conn->conn_target->t_name, ag->ag_name); } else { log_debugx("initiator requests to connect " "to target \"%s\"", conn->conn_target->t_name); } } else { assert(conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY); ag = pg->pg_discovery_auth_group; if (ag->ag_name != NULL) { log_debugx("initiator requests " "discovery session; auth-group \"%s\"", ag->ag_name); } else { log_debugx("initiator requests discovery session"); } } if (ag->ag_type == AG_TYPE_DENY) { login_send_error(request, 0x02, 0x01); log_errx(1, "auth-type is \"deny\""); } if (ag->ag_type == AG_TYPE_UNKNOWN) { /* * This can happen with empty auth-group. */ login_send_error(request, 0x02, 0x01); log_errx(1, "auth-type not set, denying access"); } /* * Enforce initiator-name and initiator-portal. */ if (auth_name_check(ag, initiator_name) != 0) { login_send_error(request, 0x02, 0x02); log_errx(1, "initiator does not match allowed initiator names"); } if (auth_portal_check(ag, &conn->conn_initiator_sa) != 0) { login_send_error(request, 0x02, 0x02); log_errx(1, "initiator does not match allowed " "initiator portals"); } /* * Let's see if the initiator intends to do any kind of authentication * at all. */ if (login_csg(request) == BHSLR_STAGE_OPERATIONAL_NEGOTIATION) { if (ag->ag_type != AG_TYPE_NO_AUTHENTICATION) { login_send_error(request, 0x02, 0x01); log_errx(1, "initiator skipped the authentication, " "but authentication is required"); } keys_delete(request_keys); log_debugx("initiator skipped the authentication, " "and we don't need it; proceeding with negotiation"); login_negotiate(conn, request); return; } fail = false; response = login_new_response(request); response_keys = keys_new(); trans = (bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0; auth_method = keys_find(request_keys, "AuthMethod"); if (ag->ag_type == AG_TYPE_NO_AUTHENTICATION) { log_debugx("authentication not required"); if (auth_method == NULL || login_list_contains(auth_method, "None")) { keys_add(response_keys, "AuthMethod", "None"); } else { log_warnx("initiator requests " "AuthMethod \"%s\" instead of \"None\"", auth_method); keys_add(response_keys, "AuthMethod", "Reject"); } if (trans) login_set_nsg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION); } else { log_debugx("CHAP authentication required"); if (auth_method == NULL || login_list_contains(auth_method, "CHAP")) { keys_add(response_keys, "AuthMethod", "CHAP"); } else { log_warnx("initiator requests unsupported " "AuthMethod \"%s\" instead of \"CHAP\"", auth_method); keys_add(response_keys, "AuthMethod", "Reject"); fail = true; } } if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) { if (conn->conn_target->t_alias != NULL) keys_add(response_keys, "TargetAlias", conn->conn_target->t_alias); keys_add_int(response_keys, "TargetPortalGroupTag", pg->pg_tag); } keys_save(response_keys, response); pdu_send(response); pdu_delete(response); keys_delete(response_keys); pdu_delete(request); keys_delete(request_keys); if (fail) { log_debugx("sent reject for AuthMethod; exiting"); exit(1); } if (ag->ag_type != AG_TYPE_NO_AUTHENTICATION) { login_chap(conn, ag); login_negotiate(conn, NULL); } else if (trans) { login_negotiate(conn, NULL); } else { login_wait_transition(conn); } } Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/inetd/inetd.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/usr.sbin/inetd/inetd.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/usr.sbin/inetd/inetd.c (revision 312218) @@ -1,2577 +1,2581 @@ /* * Copyright (c) 1983, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1983, 1991, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)from: inetd.c 8.4 (Berkeley) 4/13/94"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); /* * Inetd - Internet super-server * * This program invokes all internet services as needed. Connection-oriented * services are invoked each time a connection is made, by creating a process. * This process is passed the connection as file descriptor 0 and is expected * to do a getpeername to find out the source host and port. * * Datagram oriented services are invoked when a datagram * arrives; a process is created and passed a pending message * on file descriptor 0. Datagram servers may either connect * to their peer, freeing up the original socket for inetd * to receive further messages on, or ``take over the socket'', * processing all arriving datagrams and, eventually, timing * out. The first type of server is said to be ``multi-threaded''; * the second type of server ``single-threaded''. * * Inetd uses a configuration file which is read at startup * and, possibly, at some later time in response to a hangup signal. * The configuration file is ``free format'' with fields given in the * order shown below. Continuation lines for an entry must begin with * a space or tab. All fields must be present in each entry. * * service name must be in /etc/services * or name a tcpmux service * or specify a unix domain socket * socket type stream/dgram/raw/rdm/seqpacket * protocol tcp[4][6], udp[4][6], unix * wait/nowait single-threaded/multi-threaded * user[:group][/login-class] user/group/login-class to run daemon as * server program full path name * server program arguments maximum of MAXARGS (20) * * TCP services without official port numbers are handled with the * RFC1078-based tcpmux internal service. Tcpmux listens on port 1 for * requests. When a connection is made from a foreign host, the service * requested is passed to tcpmux, which looks it up in the servtab list * and returns the proper entry for the service. Tcpmux returns a * negative reply if the service doesn't exist, otherwise the invoked * server is expected to return the positive reply if the service type in * inetd.conf file has the prefix "tcpmux/". If the service type has the * prefix "tcpmux/+", tcpmux will return the positive reply for the * process; this is for compatibility with older server code, and also * allows you to invoke programs that use stdin/stdout without putting any * special server code in them. Services that use tcpmux are "nowait" * because they do not have a well-known port and hence cannot listen * for new requests. * * For RPC services * service name/version must be in /etc/rpc * socket type stream/dgram/raw/rdm/seqpacket * protocol rpc/tcp[4][6], rpc/udp[4][6] * wait/nowait single-threaded/multi-threaded * user[:group][/login-class] user/group/login-class to run daemon as * server program full path name * server program arguments maximum of MAXARGS * * Comment lines are indicated by a `#' in column 1. * * #ifdef IPSEC * Comment lines that start with "#@" denote IPsec policy string, as described * in ipsec_set_policy(3). This will affect all the following items in * inetd.conf(8). To reset the policy, just use "#@" line. By default, * there's no IPsec policy. * #endif */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#ifdef LIBWRAP #include +#endif #include #include "inetd.h" #include "pathnames.h" #ifdef IPSEC #include #ifndef IPSEC_POLICY_IPSEC /* no ipsec support on old ipsec */ #undef IPSEC #endif #endif #ifndef LIBWRAP_ALLOW_FACILITY # define LIBWRAP_ALLOW_FACILITY LOG_AUTH #endif #ifndef LIBWRAP_ALLOW_SEVERITY # define LIBWRAP_ALLOW_SEVERITY LOG_INFO #endif #ifndef LIBWRAP_DENY_FACILITY # define LIBWRAP_DENY_FACILITY LOG_AUTH #endif #ifndef LIBWRAP_DENY_SEVERITY # define LIBWRAP_DENY_SEVERITY LOG_WARNING #endif #define ISWRAP(sep) \ ( ((wrap_ex && !(sep)->se_bi) || (wrap_bi && (sep)->se_bi)) \ && (sep->se_family == AF_INET || sep->se_family == AF_INET6) \ && ( ((sep)->se_accept && (sep)->se_socktype == SOCK_STREAM) \ || (sep)->se_socktype == SOCK_DGRAM)) #ifdef LOGIN_CAP #include /* see init.c */ #define RESOURCE_RC "daemon" #endif #ifndef MAXCHILD #define MAXCHILD -1 /* maximum number of this service < 0 = no limit */ #endif #ifndef MAXCPM #define MAXCPM -1 /* rate limit invocations from a single remote address, < 0 = no limit */ #endif #ifndef MAXPERIP #define MAXPERIP -1 /* maximum number of this service from a single remote address, < 0 = no limit */ #endif #ifndef TOOMANY #define TOOMANY 256 /* don't start more than TOOMANY */ #endif #define CNT_INTVL 60 /* servers in CNT_INTVL sec. */ #define RETRYTIME (60*10) /* retry after bind or server fail */ #define MAX_MAXCHLD 32767 /* max allowable max children */ #define SIGBLOCK (sigmask(SIGCHLD)|sigmask(SIGHUP)|sigmask(SIGALRM)) #define satosin(sa) ((struct sockaddr_in *)(void *)sa) #define csatosin(sa) ((const struct sockaddr_in *)(const void *)sa) #ifdef INET6 #define satosin6(sa) ((struct sockaddr_in6 *)(void *)sa) #define csatosin6(sa) ((const struct sockaddr_in6 *)(const void *)sa) #endif static void close_sep(struct servtab *); static void flag_signal(int); static void flag_config(int); static void config(void); static int cpmip(const struct servtab *, int); static void endconfig(void); static struct servtab *enter(struct servtab *); static void freeconfig(struct servtab *); static struct servtab *getconfigent(void); static int matchservent(const char *, const char *, const char *); static char *nextline(FILE *); static void addchild(struct servtab *, int); static void flag_reapchild(int); static void reapchild(void); static void enable(struct servtab *); static void disable(struct servtab *); static void flag_retry(int); static void retry(void); static int setconfig(void); static void setup(struct servtab *); #ifdef IPSEC static void ipsecsetup(struct servtab *); #endif static void unregisterrpc(register struct servtab *sep); static struct conninfo *search_conn(struct servtab *sep, int ctrl); static int room_conn(struct servtab *sep, struct conninfo *conn); static void addchild_conn(struct conninfo *conn, pid_t pid); static void reapchild_conn(pid_t pid); static void free_conn(struct conninfo *conn); static void resize_conn(struct servtab *sep, int maxperip); static void free_connlist(struct servtab *sep); static void free_proc(struct procinfo *); static struct procinfo *search_proc(pid_t pid, int add); static int hashval(char *p, int len); static char *skip(char **); static char *sskip(char **); static char *newstr(const char *); static void print_service(const char *, const struct servtab *); /* tcpd.h */ int allow_severity; int deny_severity; static int wrap_ex = 0; static int wrap_bi = 0; int debug = 0; static int dolog = 0; static int maxsock; /* highest-numbered descriptor */ static fd_set allsock; static int options; static int timingout; static int toomany = TOOMANY; static int maxchild = MAXCHILD; static int maxcpm = MAXCPM; static int maxperip = MAXPERIP; static struct servent *sp; static struct rpcent *rpc; static char *hostname = NULL; static struct sockaddr_in *bind_sa4; static int v4bind_ok = 0; #ifdef INET6 static struct sockaddr_in6 *bind_sa6; static int v6bind_ok = 0; #endif static int signalpipe[2]; #ifdef SANITY_CHECK static int nsock; #endif static uid_t euid; static gid_t egid; static mode_t mask; struct servtab *servtab; static const char *CONFIG = _PATH_INETDCONF; static const char *pid_file = _PATH_INETDPID; static struct pidfh *pfh = NULL; static struct netconfig *udpconf, *tcpconf, *udp6conf, *tcp6conf; static LIST_HEAD(, procinfo) proctable[PERIPSIZE]; static int getvalue(const char *arg, int *value, const char *whine) { int tmp; char *p; tmp = strtol(arg, &p, 0); if (tmp < 0 || *p) { syslog(LOG_ERR, whine, arg); return 1; /* failure */ } *value = tmp; return 0; /* success */ } +#ifdef LIBWRAP static sa_family_t whichaf(struct request_info *req) { struct sockaddr *sa; sa = (struct sockaddr *)req->client->sin; if (sa == NULL) return AF_UNSPEC; #ifdef INET6 if (sa->sa_family == AF_INET6 && IN6_IS_ADDR_V4MAPPED(&satosin6(sa)->sin6_addr)) return AF_INET; #endif return sa->sa_family; } +#endif int main(int argc, char **argv) { struct servtab *sep; struct passwd *pwd; struct group *grp; struct sigaction sa, saalrm, sachld, sahup, sapipe; int ch, dofork; pid_t pid; char buf[50]; #ifdef LOGIN_CAP login_cap_t *lc = NULL; #endif #ifdef LIBWRAP struct request_info req; int denied; char *service = NULL; #endif struct sockaddr_storage peer; int i; struct addrinfo hints, *res; const char *servname; int error; struct conninfo *conn; openlog("inetd", LOG_PID | LOG_NOWAIT | LOG_PERROR, LOG_DAEMON); while ((ch = getopt(argc, argv, "dlwWR:a:c:C:p:s:")) != -1) switch(ch) { case 'd': debug = 1; options |= SO_DEBUG; break; case 'l': dolog = 1; break; case 'R': getvalue(optarg, &toomany, "-R %s: bad value for service invocation rate"); break; case 'c': getvalue(optarg, &maxchild, "-c %s: bad value for maximum children"); break; case 'C': getvalue(optarg, &maxcpm, "-C %s: bad value for maximum children/minute"); break; case 'a': hostname = optarg; break; case 'p': pid_file = optarg; break; case 's': getvalue(optarg, &maxperip, "-s %s: bad value for maximum children per source address"); break; case 'w': wrap_ex++; break; case 'W': wrap_bi++; break; case '?': default: syslog(LOG_ERR, "usage: inetd [-dlwW] [-a address] [-R rate]" " [-c maximum] [-C rate]" " [-p pidfile] [conf-file]"); exit(EX_USAGE); } /* * Initialize Bind Addrs. * When hostname is NULL, wild card bind addrs are obtained from * getaddrinfo(). But getaddrinfo() requires at least one of * hostname or servname is non NULL. * So when hostname is NULL, set dummy value to servname. * Since getaddrinfo() doesn't accept numeric servname, and * we doesn't use ai_socktype of struct addrinfo returned * from getaddrinfo(), we set dummy value to ai_socktype. */ servname = (hostname == NULL) ? "0" /* dummy */ : NULL; bzero(&hints, sizeof(struct addrinfo)); hints.ai_flags = AI_PASSIVE; hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; /* dummy */ error = getaddrinfo(hostname, servname, &hints, &res); if (error != 0) { syslog(LOG_ERR, "-a %s: %s", hostname, gai_strerror(error)); if (error == EAI_SYSTEM) syslog(LOG_ERR, "%s", strerror(errno)); exit(EX_USAGE); } do { if (res->ai_addr == NULL) { syslog(LOG_ERR, "-a %s: getaddrinfo failed", hostname); exit(EX_USAGE); } switch (res->ai_addr->sa_family) { case AF_INET: if (v4bind_ok) continue; bind_sa4 = satosin(res->ai_addr); /* init port num in case servname is dummy */ bind_sa4->sin_port = 0; v4bind_ok = 1; continue; #ifdef INET6 case AF_INET6: if (v6bind_ok) continue; bind_sa6 = satosin6(res->ai_addr); /* init port num in case servname is dummy */ bind_sa6->sin6_port = 0; v6bind_ok = 1; continue; #endif } if (v4bind_ok #ifdef INET6 && v6bind_ok #endif ) break; } while ((res = res->ai_next) != NULL); if (!v4bind_ok #ifdef INET6 && !v6bind_ok #endif ) { syslog(LOG_ERR, "-a %s: unknown address family", hostname); exit(EX_USAGE); } euid = geteuid(); egid = getegid(); umask(mask = umask(0777)); argc -= optind; argv += optind; if (argc > 0) CONFIG = argv[0]; if (access(CONFIG, R_OK) < 0) syslog(LOG_ERR, "Accessing %s: %m, continuing anyway.", CONFIG); if (debug == 0) { pid_t otherpid; pfh = pidfile_open(pid_file, 0600, &otherpid); if (pfh == NULL) { if (errno == EEXIST) { syslog(LOG_ERR, "%s already running, pid: %d", getprogname(), otherpid); exit(EX_OSERR); } syslog(LOG_WARNING, "pidfile_open() failed: %m"); } if (daemon(0, 0) < 0) { syslog(LOG_WARNING, "daemon(0,0) failed: %m"); } /* From now on we don't want syslog messages going to stderr. */ closelog(); openlog("inetd", LOG_PID | LOG_NOWAIT, LOG_DAEMON); /* * In case somebody has started inetd manually, we need to * clear the logname, so that old servers run as root do not * get the user's logname.. */ if (setlogin("") < 0) { syslog(LOG_WARNING, "cannot clear logname: %m"); /* no big deal if it fails.. */ } if (pfh != NULL && pidfile_write(pfh) == -1) { syslog(LOG_WARNING, "pidfile_write(): %m"); } } if (madvise(NULL, 0, MADV_PROTECT) != 0) syslog(LOG_WARNING, "madvise() failed: %s", strerror(errno)); for (i = 0; i < PERIPSIZE; ++i) LIST_INIT(&proctable[i]); if (v4bind_ok) { udpconf = getnetconfigent("udp"); tcpconf = getnetconfigent("tcp"); if (udpconf == NULL || tcpconf == NULL) { syslog(LOG_ERR, "unknown rpc/udp or rpc/tcp"); exit(EX_USAGE); } } #ifdef INET6 if (v6bind_ok) { udp6conf = getnetconfigent("udp6"); tcp6conf = getnetconfigent("tcp6"); if (udp6conf == NULL || tcp6conf == NULL) { syslog(LOG_ERR, "unknown rpc/udp6 or rpc/tcp6"); exit(EX_USAGE); } } #endif sa.sa_flags = 0; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGALRM); sigaddset(&sa.sa_mask, SIGCHLD); sigaddset(&sa.sa_mask, SIGHUP); sa.sa_handler = flag_retry; sigaction(SIGALRM, &sa, &saalrm); config(); sa.sa_handler = flag_config; sigaction(SIGHUP, &sa, &sahup); sa.sa_handler = flag_reapchild; sigaction(SIGCHLD, &sa, &sachld); sa.sa_handler = SIG_IGN; sigaction(SIGPIPE, &sa, &sapipe); { /* space for daemons to overwrite environment for ps */ #define DUMMYSIZE 100 char dummy[DUMMYSIZE]; (void)memset(dummy, 'x', DUMMYSIZE - 1); dummy[DUMMYSIZE - 1] = '\0'; (void)setenv("inetd_dummy", dummy, 1); } if (pipe2(signalpipe, O_CLOEXEC) != 0) { syslog(LOG_ERR, "pipe: %m"); exit(EX_OSERR); } FD_SET(signalpipe[0], &allsock); #ifdef SANITY_CHECK nsock++; #endif if (signalpipe[0] > maxsock) maxsock = signalpipe[0]; if (signalpipe[1] > maxsock) maxsock = signalpipe[1]; for (;;) { int n, ctrl; fd_set readable; #ifdef SANITY_CHECK if (nsock == 0) { syslog(LOG_ERR, "%s: nsock=0", __func__); exit(EX_SOFTWARE); } #endif readable = allsock; if ((n = select(maxsock + 1, &readable, (fd_set *)0, (fd_set *)0, (struct timeval *)0)) <= 0) { if (n < 0 && errno != EINTR) { syslog(LOG_WARNING, "select: %m"); sleep(1); } continue; } /* handle any queued signal flags */ if (FD_ISSET(signalpipe[0], &readable)) { int nsig; if (ioctl(signalpipe[0], FIONREAD, &nsig) != 0) { syslog(LOG_ERR, "ioctl: %m"); exit(EX_OSERR); } while (--nsig >= 0) { char c; if (read(signalpipe[0], &c, 1) != 1) { syslog(LOG_ERR, "read: %m"); exit(EX_OSERR); } if (debug) warnx("handling signal flag %c", c); switch(c) { case 'A': /* sigalrm */ retry(); break; case 'C': /* sigchld */ reapchild(); break; case 'H': /* sighup */ config(); break; } } } for (sep = servtab; n && sep; sep = sep->se_next) if (sep->se_fd != -1 && FD_ISSET(sep->se_fd, &readable)) { n--; if (debug) warnx("someone wants %s", sep->se_service); dofork = !sep->se_bi || sep->se_bi->bi_fork || ISWRAP(sep); conn = NULL; if (sep->se_accept && sep->se_socktype == SOCK_STREAM) { i = 1; if (ioctl(sep->se_fd, FIONBIO, &i) < 0) syslog(LOG_ERR, "ioctl (FIONBIO, 1): %m"); ctrl = accept(sep->se_fd, (struct sockaddr *)0, (socklen_t *)0); if (debug) warnx("accept, ctrl %d", ctrl); if (ctrl < 0) { if (errno != EINTR) syslog(LOG_WARNING, "accept (for %s): %m", sep->se_service); if (sep->se_accept && sep->se_socktype == SOCK_STREAM) close(ctrl); continue; } i = 0; if (ioctl(sep->se_fd, FIONBIO, &i) < 0) syslog(LOG_ERR, "ioctl1(FIONBIO, 0): %m"); if (ioctl(ctrl, FIONBIO, &i) < 0) syslog(LOG_ERR, "ioctl2(FIONBIO, 0): %m"); if (cpmip(sep, ctrl) < 0) { close(ctrl); continue; } if (dofork && (conn = search_conn(sep, ctrl)) != NULL && !room_conn(sep, conn)) { close(ctrl); continue; } } else ctrl = sep->se_fd; if (dolog && !ISWRAP(sep)) { char pname[NI_MAXHOST] = "unknown"; socklen_t sl; sl = sizeof(peer); if (getpeername(ctrl, (struct sockaddr *) &peer, &sl)) { sl = sizeof(peer); if (recvfrom(ctrl, buf, sizeof(buf), MSG_PEEK, (struct sockaddr *)&peer, &sl) >= 0) { getnameinfo((struct sockaddr *)&peer, peer.ss_len, pname, sizeof(pname), NULL, 0, NI_NUMERICHOST); } } else { getnameinfo((struct sockaddr *)&peer, peer.ss_len, pname, sizeof(pname), NULL, 0, NI_NUMERICHOST); } syslog(LOG_INFO,"%s from %s", sep->se_service, pname); } (void) sigblock(SIGBLOCK); pid = 0; /* * Fork for all external services, builtins which need to * fork and anything we're wrapping (as wrapping might * block or use hosts_options(5) twist). */ if (dofork) { if (sep->se_count++ == 0) (void)clock_gettime(CLOCK_MONOTONIC_FAST, &sep->se_time); else if (toomany > 0 && sep->se_count >= toomany) { struct timespec now; (void)clock_gettime(CLOCK_MONOTONIC_FAST, &now); if (now.tv_sec - sep->se_time.tv_sec > CNT_INTVL) { sep->se_time = now; sep->se_count = 1; } else { syslog(LOG_ERR, "%s/%s server failing (looping), service terminated", sep->se_service, sep->se_proto); if (sep->se_accept && sep->se_socktype == SOCK_STREAM) close(ctrl); close_sep(sep); free_conn(conn); sigsetmask(0L); if (!timingout) { timingout = 1; alarm(RETRYTIME); } continue; } } pid = fork(); } if (pid < 0) { syslog(LOG_ERR, "fork: %m"); if (sep->se_accept && sep->se_socktype == SOCK_STREAM) close(ctrl); free_conn(conn); sigsetmask(0L); sleep(1); continue; } if (pid) { addchild_conn(conn, pid); addchild(sep, pid); } sigsetmask(0L); if (pid == 0) { pidfile_close(pfh); if (dofork) { sigaction(SIGALRM, &saalrm, (struct sigaction *)0); sigaction(SIGCHLD, &sachld, (struct sigaction *)0); sigaction(SIGHUP, &sahup, (struct sigaction *)0); /* SIGPIPE reset before exec */ } /* * Call tcpmux to find the real service to exec. */ if (sep->se_bi && sep->se_bi->bi_fn == (bi_fn_t *) tcpmux) { sep = tcpmux(ctrl); if (sep == NULL) { close(ctrl); _exit(0); } } #ifdef LIBWRAP if (ISWRAP(sep)) { inetd_setproctitle("wrapping", ctrl); service = sep->se_server_name ? sep->se_server_name : sep->se_service; request_init(&req, RQ_DAEMON, service, RQ_FILE, ctrl, 0); fromhost(&req); deny_severity = LIBWRAP_DENY_FACILITY|LIBWRAP_DENY_SEVERITY; allow_severity = LIBWRAP_ALLOW_FACILITY|LIBWRAP_ALLOW_SEVERITY; denied = !hosts_access(&req); if (denied) { syslog(deny_severity, "refused connection from %.500s, service %s (%s%s)", eval_client(&req), service, sep->se_proto, (whichaf(&req) == AF_INET6) ? "6" : ""); if (sep->se_socktype != SOCK_STREAM) recv(ctrl, buf, sizeof (buf), 0); if (dofork) { sleep(1); _exit(0); } } if (dolog) { syslog(allow_severity, "connection from %.500s, service %s (%s%s)", eval_client(&req), service, sep->se_proto, (whichaf(&req) == AF_INET6) ? "6" : ""); } } #endif if (sep->se_bi) { (*sep->se_bi->bi_fn)(ctrl, sep); } else { if (debug) warnx("%d execl %s", getpid(), sep->se_server); /* Clear close-on-exec. */ if (fcntl(ctrl, F_SETFD, 0) < 0) { syslog(LOG_ERR, "%s/%s: fcntl (F_SETFD, 0): %m", sep->se_service, sep->se_proto); _exit(EX_OSERR); } if (ctrl != 0) { dup2(ctrl, 0); close(ctrl); } dup2(0, 1); dup2(0, 2); if ((pwd = getpwnam(sep->se_user)) == NULL) { syslog(LOG_ERR, "%s/%s: %s: no such user", sep->se_service, sep->se_proto, sep->se_user); if (sep->se_socktype != SOCK_STREAM) recv(0, buf, sizeof (buf), 0); _exit(EX_NOUSER); } grp = NULL; if ( sep->se_group != NULL && (grp = getgrnam(sep->se_group)) == NULL ) { syslog(LOG_ERR, "%s/%s: %s: no such group", sep->se_service, sep->se_proto, sep->se_group); if (sep->se_socktype != SOCK_STREAM) recv(0, buf, sizeof (buf), 0); _exit(EX_NOUSER); } if (grp != NULL) pwd->pw_gid = grp->gr_gid; #ifdef LOGIN_CAP if ((lc = login_getclass(sep->se_class)) == NULL) { /* error syslogged by getclass */ syslog(LOG_ERR, "%s/%s: %s: login class error", sep->se_service, sep->se_proto, sep->se_class); if (sep->se_socktype != SOCK_STREAM) recv(0, buf, sizeof (buf), 0); _exit(EX_NOUSER); } #endif if (setsid() < 0) { syslog(LOG_ERR, "%s: can't setsid(): %m", sep->se_service); /* _exit(EX_OSERR); not fatal yet */ } #ifdef LOGIN_CAP if (setusercontext(lc, pwd, pwd->pw_uid, LOGIN_SETALL & ~LOGIN_SETMAC) != 0) { syslog(LOG_ERR, "%s: can't setusercontext(..%s..): %m", sep->se_service, sep->se_user); _exit(EX_OSERR); } login_close(lc); #else if (pwd->pw_uid) { if (setlogin(sep->se_user) < 0) { syslog(LOG_ERR, "%s: can't setlogin(%s): %m", sep->se_service, sep->se_user); /* _exit(EX_OSERR); not yet */ } if (setgid(pwd->pw_gid) < 0) { syslog(LOG_ERR, "%s: can't set gid %d: %m", sep->se_service, pwd->pw_gid); _exit(EX_OSERR); } (void) initgroups(pwd->pw_name, pwd->pw_gid); if (setuid(pwd->pw_uid) < 0) { syslog(LOG_ERR, "%s: can't set uid %d: %m", sep->se_service, pwd->pw_uid); _exit(EX_OSERR); } } #endif sigaction(SIGPIPE, &sapipe, (struct sigaction *)0); execv(sep->se_server, sep->se_argv); syslog(LOG_ERR, "cannot execute %s: %m", sep->se_server); if (sep->se_socktype != SOCK_STREAM) recv(0, buf, sizeof (buf), 0); } if (dofork) _exit(0); } if (sep->se_accept && sep->se_socktype == SOCK_STREAM) close(ctrl); } } } /* * Add a signal flag to the signal flag queue for later handling */ static void flag_signal(int c) { char ch = c; if (write(signalpipe[1], &ch, 1) != 1) { syslog(LOG_ERR, "write: %m"); _exit(EX_OSERR); } } /* * Record a new child pid for this service. If we've reached the * limit on children, then stop accepting incoming requests. */ static void addchild(struct servtab *sep, pid_t pid) { if (sep->se_maxchild <= 0) return; #ifdef SANITY_CHECK if (sep->se_numchild >= sep->se_maxchild) { syslog(LOG_ERR, "%s: %d >= %d", __func__, sep->se_numchild, sep->se_maxchild); exit(EX_SOFTWARE); } #endif sep->se_pids[sep->se_numchild++] = pid; if (sep->se_numchild == sep->se_maxchild) disable(sep); } /* * Some child process has exited. See if it's on somebody's list. */ static void flag_reapchild(int signo __unused) { flag_signal('C'); } static void reapchild(void) { int k, status; pid_t pid; struct servtab *sep; for (;;) { pid = wait3(&status, WNOHANG, (struct rusage *)0); if (pid <= 0) break; if (debug) warnx("%d reaped, %s %u", pid, WIFEXITED(status) ? "status" : "signal", WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status)); for (sep = servtab; sep; sep = sep->se_next) { for (k = 0; k < sep->se_numchild; k++) if (sep->se_pids[k] == pid) break; if (k == sep->se_numchild) continue; if (sep->se_numchild == sep->se_maxchild) enable(sep); sep->se_pids[k] = sep->se_pids[--sep->se_numchild]; if (WIFSIGNALED(status) || WEXITSTATUS(status)) syslog(LOG_WARNING, "%s[%d]: exited, %s %u", sep->se_server, pid, WIFEXITED(status) ? "status" : "signal", WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status)); break; } reapchild_conn(pid); } } static void flag_config(int signo __unused) { flag_signal('H'); } static void config(void) { struct servtab *sep, *new, **sepp; long omask; int new_nomapped; #ifdef LOGIN_CAP login_cap_t *lc = NULL; #endif if (!setconfig()) { syslog(LOG_ERR, "%s: %m", CONFIG); return; } for (sep = servtab; sep; sep = sep->se_next) sep->se_checked = 0; while ((new = getconfigent())) { if (getpwnam(new->se_user) == NULL) { syslog(LOG_ERR, "%s/%s: no such user '%s', service ignored", new->se_service, new->se_proto, new->se_user); continue; } if (new->se_group && getgrnam(new->se_group) == NULL) { syslog(LOG_ERR, "%s/%s: no such group '%s', service ignored", new->se_service, new->se_proto, new->se_group); continue; } #ifdef LOGIN_CAP if ((lc = login_getclass(new->se_class)) == NULL) { /* error syslogged by getclass */ syslog(LOG_ERR, "%s/%s: %s: login class error, service ignored", new->se_service, new->se_proto, new->se_class); continue; } login_close(lc); #endif new_nomapped = new->se_nomapped; for (sep = servtab; sep; sep = sep->se_next) if (strcmp(sep->se_service, new->se_service) == 0 && strcmp(sep->se_proto, new->se_proto) == 0 && sep->se_rpc == new->se_rpc && sep->se_socktype == new->se_socktype && sep->se_family == new->se_family) break; if (sep != 0) { int i; #define SWAP(t,a, b) { t c = a; a = b; b = c; } omask = sigblock(SIGBLOCK); if (sep->se_nomapped != new->se_nomapped) { /* for rpc keep old nommaped till unregister */ if (!sep->se_rpc) sep->se_nomapped = new->se_nomapped; sep->se_reset = 1; } /* copy over outstanding child pids */ if (sep->se_maxchild > 0 && new->se_maxchild > 0) { new->se_numchild = sep->se_numchild; if (new->se_numchild > new->se_maxchild) new->se_numchild = new->se_maxchild; memcpy(new->se_pids, sep->se_pids, new->se_numchild * sizeof(*new->se_pids)); } SWAP(pid_t *, sep->se_pids, new->se_pids); sep->se_maxchild = new->se_maxchild; sep->se_numchild = new->se_numchild; sep->se_maxcpm = new->se_maxcpm; resize_conn(sep, new->se_maxperip); sep->se_maxperip = new->se_maxperip; sep->se_bi = new->se_bi; /* might need to turn on or off service now */ if (sep->se_fd >= 0) { if (sep->se_maxchild > 0 && sep->se_numchild == sep->se_maxchild) { if (FD_ISSET(sep->se_fd, &allsock)) disable(sep); } else { if (!FD_ISSET(sep->se_fd, &allsock)) enable(sep); } } sep->se_accept = new->se_accept; SWAP(char *, sep->se_user, new->se_user); SWAP(char *, sep->se_group, new->se_group); #ifdef LOGIN_CAP SWAP(char *, sep->se_class, new->se_class); #endif SWAP(char *, sep->se_server, new->se_server); SWAP(char *, sep->se_server_name, new->se_server_name); for (i = 0; i < MAXARGV; i++) SWAP(char *, sep->se_argv[i], new->se_argv[i]); #ifdef IPSEC SWAP(char *, sep->se_policy, new->se_policy); ipsecsetup(sep); #endif sigsetmask(omask); freeconfig(new); if (debug) print_service("REDO", sep); } else { sep = enter(new); if (debug) print_service("ADD ", sep); } sep->se_checked = 1; if (ISMUX(sep)) { sep->se_fd = -1; continue; } switch (sep->se_family) { case AF_INET: if (!v4bind_ok) { sep->se_fd = -1; continue; } break; #ifdef INET6 case AF_INET6: if (!v6bind_ok) { sep->se_fd = -1; continue; } break; #endif } if (!sep->se_rpc) { if (sep->se_family != AF_UNIX) { sp = getservbyname(sep->se_service, sep->se_proto); if (sp == 0) { syslog(LOG_ERR, "%s/%s: unknown service", sep->se_service, sep->se_proto); sep->se_checked = 0; continue; } } switch (sep->se_family) { case AF_INET: if (sp->s_port != sep->se_ctrladdr4.sin_port) { sep->se_ctrladdr4.sin_port = sp->s_port; sep->se_reset = 1; } break; #ifdef INET6 case AF_INET6: if (sp->s_port != sep->se_ctrladdr6.sin6_port) { sep->se_ctrladdr6.sin6_port = sp->s_port; sep->se_reset = 1; } break; #endif } if (sep->se_reset != 0 && sep->se_fd >= 0) close_sep(sep); } else { rpc = getrpcbyname(sep->se_service); if (rpc == 0) { syslog(LOG_ERR, "%s/%s unknown RPC service", sep->se_service, sep->se_proto); if (sep->se_fd != -1) (void) close(sep->se_fd); sep->se_fd = -1; continue; } if (sep->se_reset != 0 || rpc->r_number != sep->se_rpc_prog) { if (sep->se_rpc_prog) unregisterrpc(sep); sep->se_rpc_prog = rpc->r_number; if (sep->se_fd != -1) (void) close(sep->se_fd); sep->se_fd = -1; } sep->se_nomapped = new_nomapped; } sep->se_reset = 0; if (sep->se_fd == -1) setup(sep); } endconfig(); /* * Purge anything not looked at above. */ omask = sigblock(SIGBLOCK); sepp = &servtab; while ((sep = *sepp)) { if (sep->se_checked) { sepp = &sep->se_next; continue; } *sepp = sep->se_next; if (sep->se_fd >= 0) close_sep(sep); if (debug) print_service("FREE", sep); if (sep->se_rpc && sep->se_rpc_prog > 0) unregisterrpc(sep); freeconfig(sep); free(sep); } (void) sigsetmask(omask); } static void unregisterrpc(struct servtab *sep) { u_int i; struct servtab *sepp; long omask; struct netconfig *netid4, *netid6; omask = sigblock(SIGBLOCK); netid4 = sep->se_socktype == SOCK_DGRAM ? udpconf : tcpconf; netid6 = sep->se_socktype == SOCK_DGRAM ? udp6conf : tcp6conf; if (sep->se_family == AF_INET) netid6 = NULL; else if (sep->se_nomapped) netid4 = NULL; /* * Conflict if same prog and protocol - In that case one should look * to versions, but it is not interesting: having separate servers for * different versions does not work well. * Therefore one do not unregister if there is a conflict. * There is also transport conflict if destroying INET when INET46 * exists, or destroying INET46 when INET exists */ for (sepp = servtab; sepp; sepp = sepp->se_next) { if (sepp == sep) continue; if (sepp->se_checked == 0 || !sepp->se_rpc || strcmp(sep->se_proto, sepp->se_proto) != 0 || sep->se_rpc_prog != sepp->se_rpc_prog) continue; if (sepp->se_family == AF_INET) netid4 = NULL; if (sepp->se_family == AF_INET6) { netid6 = NULL; if (!sep->se_nomapped) netid4 = NULL; } if (netid4 == NULL && netid6 == NULL) return; } if (debug) print_service("UNREG", sep); for (i = sep->se_rpc_lowvers; i <= sep->se_rpc_highvers; i++) { if (netid4) rpcb_unset(sep->se_rpc_prog, i, netid4); if (netid6) rpcb_unset(sep->se_rpc_prog, i, netid6); } if (sep->se_fd != -1) (void) close(sep->se_fd); sep->se_fd = -1; (void) sigsetmask(omask); } static void flag_retry(int signo __unused) { flag_signal('A'); } static void retry(void) { struct servtab *sep; timingout = 0; for (sep = servtab; sep; sep = sep->se_next) if (sep->se_fd == -1 && !ISMUX(sep)) setup(sep); } static void setup(struct servtab *sep) { int on = 1; /* Set all listening sockets to close-on-exec. */ if ((sep->se_fd = socket(sep->se_family, sep->se_socktype | SOCK_CLOEXEC, 0)) < 0) { if (debug) warn("socket failed on %s/%s", sep->se_service, sep->se_proto); syslog(LOG_ERR, "%s/%s: socket: %m", sep->se_service, sep->se_proto); return; } #define turnon(fd, opt) \ setsockopt(fd, SOL_SOCKET, opt, (char *)&on, sizeof (on)) if (strcmp(sep->se_proto, "tcp") == 0 && (options & SO_DEBUG) && turnon(sep->se_fd, SO_DEBUG) < 0) syslog(LOG_ERR, "setsockopt (SO_DEBUG): %m"); if (turnon(sep->se_fd, SO_REUSEADDR) < 0) syslog(LOG_ERR, "setsockopt (SO_REUSEADDR): %m"); #ifdef SO_PRIVSTATE if (turnon(sep->se_fd, SO_PRIVSTATE) < 0) syslog(LOG_ERR, "setsockopt (SO_PRIVSTATE): %m"); #endif /* tftpd opens a new connection then needs more infos */ #ifdef INET6 if ((sep->se_family == AF_INET6) && (strcmp(sep->se_proto, "udp") == 0) && (sep->se_accept == 0) && (setsockopt(sep->se_fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, (char *)&on, sizeof (on)) < 0)) syslog(LOG_ERR, "setsockopt (IPV6_RECVPKTINFO): %m"); if (sep->se_family == AF_INET6) { int flag = sep->se_nomapped ? 1 : 0; if (setsockopt(sep->se_fd, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&flag, sizeof (flag)) < 0) syslog(LOG_ERR, "setsockopt (IPV6_V6ONLY): %m"); } #endif #undef turnon #ifdef IPSEC ipsecsetup(sep); #endif if (sep->se_family == AF_UNIX) { (void) unlink(sep->se_ctrladdr_un.sun_path); umask(0777); /* Make socket with conservative permissions */ } if (bind(sep->se_fd, (struct sockaddr *)&sep->se_ctrladdr, sep->se_ctrladdr_size) < 0) { if (debug) warn("bind failed on %s/%s", sep->se_service, sep->se_proto); syslog(LOG_ERR, "%s/%s: bind: %m", sep->se_service, sep->se_proto); (void) close(sep->se_fd); sep->se_fd = -1; if (!timingout) { timingout = 1; alarm(RETRYTIME); } if (sep->se_family == AF_UNIX) umask(mask); return; } if (sep->se_family == AF_UNIX) { /* Ick - fch{own,mod} don't work on Unix domain sockets */ if (chown(sep->se_service, sep->se_sockuid, sep->se_sockgid) < 0) syslog(LOG_ERR, "chown socket: %m"); if (chmod(sep->se_service, sep->se_sockmode) < 0) syslog(LOG_ERR, "chmod socket: %m"); umask(mask); } if (sep->se_rpc) { u_int i; socklen_t len = sep->se_ctrladdr_size; struct netconfig *netid, *netid2 = NULL; #ifdef INET6 struct sockaddr_in sock; #endif struct netbuf nbuf, nbuf2; if (getsockname(sep->se_fd, (struct sockaddr*)&sep->se_ctrladdr, &len) < 0){ syslog(LOG_ERR, "%s/%s: getsockname: %m", sep->se_service, sep->se_proto); (void) close(sep->se_fd); sep->se_fd = -1; return; } nbuf.buf = &sep->se_ctrladdr; nbuf.len = sep->se_ctrladdr.sa_len; if (sep->se_family == AF_INET) netid = sep->se_socktype==SOCK_DGRAM? udpconf:tcpconf; #ifdef INET6 else { netid = sep->se_socktype==SOCK_DGRAM? udp6conf:tcp6conf; if (!sep->se_nomapped) { /* INET and INET6 */ netid2 = netid==udp6conf? udpconf:tcpconf; memset(&sock, 0, sizeof sock); /* ADDR_ANY */ nbuf2.buf = &sock; nbuf2.len = sock.sin_len = sizeof sock; sock.sin_family = AF_INET; sock.sin_port = sep->se_ctrladdr6.sin6_port; } } #endif if (debug) print_service("REG ", sep); for (i = sep->se_rpc_lowvers; i <= sep->se_rpc_highvers; i++) { rpcb_unset(sep->se_rpc_prog, i, netid); rpcb_set(sep->se_rpc_prog, i, netid, &nbuf); if (netid2) { rpcb_unset(sep->se_rpc_prog, i, netid2); rpcb_set(sep->se_rpc_prog, i, netid2, &nbuf2); } } } if (sep->se_socktype == SOCK_STREAM) listen(sep->se_fd, -1); enable(sep); if (debug) { warnx("registered %s on %d", sep->se_server, sep->se_fd); } } #ifdef IPSEC static void ipsecsetup(struct servtab *sep) { char *buf; char *policy_in = NULL; char *policy_out = NULL; int level; int opt; switch (sep->se_family) { case AF_INET: level = IPPROTO_IP; opt = IP_IPSEC_POLICY; break; #ifdef INET6 case AF_INET6: level = IPPROTO_IPV6; opt = IPV6_IPSEC_POLICY; break; #endif default: return; } if (!sep->se_policy || sep->se_policy[0] == '\0') { static char def_in[] = "in entrust", def_out[] = "out entrust"; policy_in = def_in; policy_out = def_out; } else { if (!strncmp("in", sep->se_policy, 2)) policy_in = sep->se_policy; else if (!strncmp("out", sep->se_policy, 3)) policy_out = sep->se_policy; else { syslog(LOG_ERR, "invalid security policy \"%s\"", sep->se_policy); return; } } if (policy_in != NULL) { buf = ipsec_set_policy(policy_in, strlen(policy_in)); if (buf != NULL) { if (setsockopt(sep->se_fd, level, opt, buf, ipsec_get_policylen(buf)) < 0 && debug != 0) warnx("%s/%s: ipsec initialization failed; %s", sep->se_service, sep->se_proto, policy_in); free(buf); } else syslog(LOG_ERR, "invalid security policy \"%s\"", policy_in); } if (policy_out != NULL) { buf = ipsec_set_policy(policy_out, strlen(policy_out)); if (buf != NULL) { if (setsockopt(sep->se_fd, level, opt, buf, ipsec_get_policylen(buf)) < 0 && debug != 0) warnx("%s/%s: ipsec initialization failed; %s", sep->se_service, sep->se_proto, policy_out); free(buf); } else syslog(LOG_ERR, "invalid security policy \"%s\"", policy_out); } } #endif /* * Finish with a service and its socket. */ static void close_sep(struct servtab *sep) { if (sep->se_fd >= 0) { if (FD_ISSET(sep->se_fd, &allsock)) disable(sep); (void) close(sep->se_fd); sep->se_fd = -1; } sep->se_count = 0; sep->se_numchild = 0; /* forget about any existing children */ } static int matchservent(const char *name1, const char *name2, const char *proto) { char **alias, *p; struct servent *se; if (strcmp(proto, "unix") == 0) { if ((p = strrchr(name1, '/')) != NULL) name1 = p + 1; if ((p = strrchr(name2, '/')) != NULL) name2 = p + 1; } if (strcmp(name1, name2) == 0) return(1); if ((se = getservbyname(name1, proto)) != NULL) { if (strcmp(name2, se->s_name) == 0) return(1); for (alias = se->s_aliases; *alias; alias++) if (strcmp(name2, *alias) == 0) return(1); } return(0); } static struct servtab * enter(struct servtab *cp) { struct servtab *sep; long omask; sep = (struct servtab *)malloc(sizeof (*sep)); if (sep == (struct servtab *)0) { syslog(LOG_ERR, "malloc: %m"); exit(EX_OSERR); } *sep = *cp; sep->se_fd = -1; omask = sigblock(SIGBLOCK); sep->se_next = servtab; servtab = sep; sigsetmask(omask); return (sep); } static void enable(struct servtab *sep) { if (debug) warnx( "enabling %s, fd %d", sep->se_service, sep->se_fd); #ifdef SANITY_CHECK if (sep->se_fd < 0) { syslog(LOG_ERR, "%s: %s: bad fd", __func__, sep->se_service); exit(EX_SOFTWARE); } if (ISMUX(sep)) { syslog(LOG_ERR, "%s: %s: is mux", __func__, sep->se_service); exit(EX_SOFTWARE); } if (FD_ISSET(sep->se_fd, &allsock)) { syslog(LOG_ERR, "%s: %s: not off", __func__, sep->se_service); exit(EX_SOFTWARE); } nsock++; #endif FD_SET(sep->se_fd, &allsock); if (sep->se_fd > maxsock) maxsock = sep->se_fd; } static void disable(struct servtab *sep) { if (debug) warnx( "disabling %s, fd %d", sep->se_service, sep->se_fd); #ifdef SANITY_CHECK if (sep->se_fd < 0) { syslog(LOG_ERR, "%s: %s: bad fd", __func__, sep->se_service); exit(EX_SOFTWARE); } if (ISMUX(sep)) { syslog(LOG_ERR, "%s: %s: is mux", __func__, sep->se_service); exit(EX_SOFTWARE); } if (!FD_ISSET(sep->se_fd, &allsock)) { syslog(LOG_ERR, "%s: %s: not on", __func__, sep->se_service); exit(EX_SOFTWARE); } if (nsock == 0) { syslog(LOG_ERR, "%s: nsock=0", __func__); exit(EX_SOFTWARE); } nsock--; #endif FD_CLR(sep->se_fd, &allsock); if (sep->se_fd == maxsock) maxsock--; } static FILE *fconfig = NULL; static struct servtab serv; static char line[LINE_MAX]; static int setconfig(void) { if (fconfig != NULL) { fseek(fconfig, 0L, SEEK_SET); return (1); } fconfig = fopen(CONFIG, "r"); return (fconfig != NULL); } static void endconfig(void) { if (fconfig) { (void) fclose(fconfig); fconfig = NULL; } } static struct servtab * getconfigent(void) { struct servtab *sep = &serv; int argc; char *cp, *arg, *s; char *versp; static char TCPMUX_TOKEN[] = "tcpmux/"; #define MUX_LEN (sizeof(TCPMUX_TOKEN)-1) #ifdef IPSEC char *policy; #endif int v4bind; #ifdef INET6 int v6bind; #endif int i; #ifdef IPSEC policy = NULL; #endif more: v4bind = 0; #ifdef INET6 v6bind = 0; #endif while ((cp = nextline(fconfig)) != NULL) { #ifdef IPSEC /* lines starting with #@ is not a comment, but the policy */ if (cp[0] == '#' && cp[1] == '@') { char *p; for (p = cp + 2; p && *p && isspace(*p); p++) ; if (*p == '\0') { if (policy) free(policy); policy = NULL; } else if (ipsec_get_policylen(p) >= 0) { if (policy) free(policy); policy = newstr(p); } else { syslog(LOG_ERR, "%s: invalid ipsec policy \"%s\"", CONFIG, p); exit(EX_CONFIG); } } #endif if (*cp == '#' || *cp == '\0') continue; break; } if (cp == NULL) return ((struct servtab *)0); /* * clear the static buffer, since some fields (se_ctrladdr, * for example) don't get initialized here. */ memset(sep, 0, sizeof *sep); arg = skip(&cp); if (cp == NULL) { /* got an empty line containing just blanks/tabs. */ goto more; } if (arg[0] == ':') { /* :user:group:perm: */ char *user, *group, *perm; struct passwd *pw; struct group *gr; user = arg+1; if ((group = strchr(user, ':')) == NULL) { syslog(LOG_ERR, "no group after user '%s'", user); goto more; } *group++ = '\0'; if ((perm = strchr(group, ':')) == NULL) { syslog(LOG_ERR, "no mode after group '%s'", group); goto more; } *perm++ = '\0'; if ((pw = getpwnam(user)) == NULL) { syslog(LOG_ERR, "no such user '%s'", user); goto more; } sep->se_sockuid = pw->pw_uid; if ((gr = getgrnam(group)) == NULL) { syslog(LOG_ERR, "no such user '%s'", group); goto more; } sep->se_sockgid = gr->gr_gid; sep->se_sockmode = strtol(perm, &arg, 8); if (*arg != ':') { syslog(LOG_ERR, "bad mode '%s'", perm); goto more; } *arg++ = '\0'; } else { sep->se_sockuid = euid; sep->se_sockgid = egid; sep->se_sockmode = 0200; } if (strncmp(arg, TCPMUX_TOKEN, MUX_LEN) == 0) { char *c = arg + MUX_LEN; if (*c == '+') { sep->se_type = MUXPLUS_TYPE; c++; } else sep->se_type = MUX_TYPE; sep->se_service = newstr(c); } else { sep->se_service = newstr(arg); sep->se_type = NORM_TYPE; } arg = sskip(&cp); if (strcmp(arg, "stream") == 0) sep->se_socktype = SOCK_STREAM; else if (strcmp(arg, "dgram") == 0) sep->se_socktype = SOCK_DGRAM; else if (strcmp(arg, "rdm") == 0) sep->se_socktype = SOCK_RDM; else if (strcmp(arg, "seqpacket") == 0) sep->se_socktype = SOCK_SEQPACKET; else if (strcmp(arg, "raw") == 0) sep->se_socktype = SOCK_RAW; else sep->se_socktype = -1; arg = sskip(&cp); if (strncmp(arg, "tcp", 3) == 0) { sep->se_proto = newstr(strsep(&arg, "/")); if (arg != NULL && (strcmp(arg, "faith") == 0)) { syslog(LOG_ERR, "faith has been deprecated"); goto more; } } else { if (sep->se_type == NORM_TYPE && strncmp(arg, "faith/", 6) == 0) { syslog(LOG_ERR, "faith has been deprecated"); goto more; } sep->se_proto = newstr(arg); } if (strncmp(sep->se_proto, "rpc/", 4) == 0) { memmove(sep->se_proto, sep->se_proto + 4, strlen(sep->se_proto) + 1 - 4); sep->se_rpc = 1; sep->se_rpc_prog = sep->se_rpc_lowvers = sep->se_rpc_highvers = 0; if ((versp = strrchr(sep->se_service, '/'))) { *versp++ = '\0'; switch (sscanf(versp, "%u-%u", &sep->se_rpc_lowvers, &sep->se_rpc_highvers)) { case 2: break; case 1: sep->se_rpc_highvers = sep->se_rpc_lowvers; break; default: syslog(LOG_ERR, "bad RPC version specifier; %s", sep->se_service); freeconfig(sep); goto more; } } else { sep->se_rpc_lowvers = sep->se_rpc_highvers = 1; } } sep->se_nomapped = 0; if (strcmp(sep->se_proto, "unix") == 0) { sep->se_family = AF_UNIX; } else { while (isdigit(sep->se_proto[strlen(sep->se_proto) - 1])) { #ifdef INET6 if (sep->se_proto[strlen(sep->se_proto) - 1] == '6') { sep->se_proto[strlen(sep->se_proto) - 1] = '\0'; v6bind = 1; continue; } #endif if (sep->se_proto[strlen(sep->se_proto) - 1] == '4') { sep->se_proto[strlen(sep->se_proto) - 1] = '\0'; v4bind = 1; continue; } /* illegal version num */ syslog(LOG_ERR, "bad IP version for %s", sep->se_proto); freeconfig(sep); goto more; } #ifdef INET6 if (v6bind && !v6bind_ok) { syslog(LOG_INFO, "IPv6 bind is ignored for %s", sep->se_service); if (v4bind && v4bind_ok) v6bind = 0; else { freeconfig(sep); goto more; } } if (v6bind) { sep->se_family = AF_INET6; if (!v4bind || !v4bind_ok) sep->se_nomapped = 1; } else #endif { /* default to v4 bind if not v6 bind */ if (!v4bind_ok) { syslog(LOG_NOTICE, "IPv4 bind is ignored for %s", sep->se_service); freeconfig(sep); goto more; } sep->se_family = AF_INET; } } /* init ctladdr */ switch(sep->se_family) { case AF_INET: memcpy(&sep->se_ctrladdr4, bind_sa4, sizeof(sep->se_ctrladdr4)); sep->se_ctrladdr_size = sizeof(sep->se_ctrladdr4); break; #ifdef INET6 case AF_INET6: memcpy(&sep->se_ctrladdr6, bind_sa6, sizeof(sep->se_ctrladdr6)); sep->se_ctrladdr_size = sizeof(sep->se_ctrladdr6); break; #endif case AF_UNIX: if (strlen(sep->se_service) >= sizeof(sep->se_ctrladdr_un.sun_path)) { syslog(LOG_ERR, "domain socket pathname too long for service %s", sep->se_service); goto more; } memset(&sep->se_ctrladdr, 0, sizeof(sep->se_ctrladdr)); sep->se_ctrladdr_un.sun_family = sep->se_family; sep->se_ctrladdr_un.sun_len = strlen(sep->se_service); strcpy(sep->se_ctrladdr_un.sun_path, sep->se_service); sep->se_ctrladdr_size = SUN_LEN(&sep->se_ctrladdr_un); } arg = sskip(&cp); if (!strncmp(arg, "wait", 4)) sep->se_accept = 0; else if (!strncmp(arg, "nowait", 6)) sep->se_accept = 1; else { syslog(LOG_ERR, "%s: bad wait/nowait for service %s", CONFIG, sep->se_service); goto more; } sep->se_maxchild = -1; sep->se_maxcpm = -1; sep->se_maxperip = -1; if ((s = strchr(arg, '/')) != NULL) { char *eptr; u_long val; val = strtoul(s + 1, &eptr, 10); if (eptr == s + 1 || val > MAX_MAXCHLD) { syslog(LOG_ERR, "%s: bad max-child for service %s", CONFIG, sep->se_service); goto more; } if (debug) if (!sep->se_accept && val != 1) warnx("maxchild=%lu for wait service %s" " not recommended", val, sep->se_service); sep->se_maxchild = val; if (*eptr == '/') sep->se_maxcpm = strtol(eptr + 1, &eptr, 10); if (*eptr == '/') sep->se_maxperip = strtol(eptr + 1, &eptr, 10); /* * explicitly do not check for \0 for future expansion / * backwards compatibility */ } if (ISMUX(sep)) { /* * Silently enforce "nowait" mode for TCPMUX services * since they don't have an assigned port to listen on. */ sep->se_accept = 1; if (strcmp(sep->se_proto, "tcp")) { syslog(LOG_ERR, "%s: bad protocol for tcpmux service %s", CONFIG, sep->se_service); goto more; } if (sep->se_socktype != SOCK_STREAM) { syslog(LOG_ERR, "%s: bad socket type for tcpmux service %s", CONFIG, sep->se_service); goto more; } } sep->se_user = newstr(sskip(&cp)); #ifdef LOGIN_CAP if ((s = strrchr(sep->se_user, '/')) != NULL) { *s = '\0'; sep->se_class = newstr(s + 1); } else sep->se_class = newstr(RESOURCE_RC); #endif if ((s = strrchr(sep->se_user, ':')) != NULL) { *s = '\0'; sep->se_group = newstr(s + 1); } else sep->se_group = NULL; sep->se_server = newstr(sskip(&cp)); if ((sep->se_server_name = strrchr(sep->se_server, '/'))) sep->se_server_name++; if (strcmp(sep->se_server, "internal") == 0) { struct biltin *bi; for (bi = biltins; bi->bi_service; bi++) if (bi->bi_socktype == sep->se_socktype && matchservent(bi->bi_service, sep->se_service, sep->se_proto)) break; if (bi->bi_service == 0) { syslog(LOG_ERR, "internal service %s unknown", sep->se_service); goto more; } sep->se_accept = 1; /* force accept mode for built-ins */ sep->se_bi = bi; } else sep->se_bi = NULL; if (sep->se_maxperip < 0) sep->se_maxperip = maxperip; if (sep->se_maxcpm < 0) sep->se_maxcpm = maxcpm; if (sep->se_maxchild < 0) { /* apply default max-children */ if (sep->se_bi && sep->se_bi->bi_maxchild >= 0) sep->se_maxchild = sep->se_bi->bi_maxchild; else if (sep->se_accept) sep->se_maxchild = MAX(maxchild, 0); else sep->se_maxchild = 1; } if (sep->se_maxchild > 0) { sep->se_pids = malloc(sep->se_maxchild * sizeof(*sep->se_pids)); if (sep->se_pids == NULL) { syslog(LOG_ERR, "malloc: %m"); exit(EX_OSERR); } } argc = 0; for (arg = skip(&cp); cp; arg = skip(&cp)) if (argc < MAXARGV) { sep->se_argv[argc++] = newstr(arg); } else { syslog(LOG_ERR, "%s: too many arguments for service %s", CONFIG, sep->se_service); goto more; } while (argc <= MAXARGV) sep->se_argv[argc++] = NULL; for (i = 0; i < PERIPSIZE; ++i) LIST_INIT(&sep->se_conn[i]); #ifdef IPSEC sep->se_policy = policy ? newstr(policy) : NULL; #endif return (sep); } static void freeconfig(struct servtab *cp) { int i; if (cp->se_service) free(cp->se_service); if (cp->se_proto) free(cp->se_proto); if (cp->se_user) free(cp->se_user); if (cp->se_group) free(cp->se_group); #ifdef LOGIN_CAP if (cp->se_class) free(cp->se_class); #endif if (cp->se_server) free(cp->se_server); if (cp->se_pids) free(cp->se_pids); for (i = 0; i < MAXARGV; i++) if (cp->se_argv[i]) free(cp->se_argv[i]); free_connlist(cp); #ifdef IPSEC if (cp->se_policy) free(cp->se_policy); #endif } /* * Safe skip - if skip returns null, log a syntax error in the * configuration file and exit. */ static char * sskip(char **cpp) { char *cp; cp = skip(cpp); if (cp == NULL) { syslog(LOG_ERR, "%s: syntax error", CONFIG); exit(EX_DATAERR); } return (cp); } static char * skip(char **cpp) { char *cp = *cpp; char *start; char quote = '\0'; again: while (*cp == ' ' || *cp == '\t') cp++; if (*cp == '\0') { int c; c = getc(fconfig); (void) ungetc(c, fconfig); if (c == ' ' || c == '\t') if ((cp = nextline(fconfig))) goto again; *cpp = (char *)0; return ((char *)0); } if (*cp == '"' || *cp == '\'') quote = *cp++; start = cp; if (quote) while (*cp && *cp != quote) cp++; else while (*cp && *cp != ' ' && *cp != '\t') cp++; if (*cp != '\0') *cp++ = '\0'; *cpp = cp; return (start); } static char * nextline(FILE *fd) { char *cp; if (fgets(line, sizeof (line), fd) == NULL) return ((char *)0); cp = strchr(line, '\n'); if (cp) *cp = '\0'; return (line); } static char * newstr(const char *cp) { char *cr; if ((cr = strdup(cp != NULL ? cp : ""))) return (cr); syslog(LOG_ERR, "strdup: %m"); exit(EX_OSERR); } void inetd_setproctitle(const char *a, int s) { socklen_t size; struct sockaddr_storage ss; char buf[80], pbuf[NI_MAXHOST]; size = sizeof(ss); if (getpeername(s, (struct sockaddr *)&ss, &size) == 0) { getnameinfo((struct sockaddr *)&ss, size, pbuf, sizeof(pbuf), NULL, 0, NI_NUMERICHOST); (void) sprintf(buf, "%s [%s]", a, pbuf); } else (void) sprintf(buf, "%s", a); setproctitle("%s", buf); } int check_loop(const struct sockaddr *sa, const struct servtab *sep) { struct servtab *se2; char pname[NI_MAXHOST]; for (se2 = servtab; se2; se2 = se2->se_next) { if (!se2->se_bi || se2->se_socktype != SOCK_DGRAM) continue; switch (se2->se_family) { case AF_INET: if (csatosin(sa)->sin_port == se2->se_ctrladdr4.sin_port) goto isloop; continue; #ifdef INET6 case AF_INET6: if (csatosin6(sa)->sin6_port == se2->se_ctrladdr6.sin6_port) goto isloop; continue; #endif default: continue; } isloop: getnameinfo(sa, sa->sa_len, pname, sizeof(pname), NULL, 0, NI_NUMERICHOST); syslog(LOG_WARNING, "%s/%s:%s/%s loop request REFUSED from %s", sep->se_service, sep->se_proto, se2->se_service, se2->se_proto, pname); return 1; } return 0; } /* * print_service: * Dump relevant information to stderr */ static void print_service(const char *action, const struct servtab *sep) { fprintf(stderr, "%s: %s proto=%s accept=%d max=%d user=%s group=%s" #ifdef LOGIN_CAP "class=%s" #endif " builtin=%p server=%s" #ifdef IPSEC " policy=\"%s\"" #endif "\n", action, sep->se_service, sep->se_proto, sep->se_accept, sep->se_maxchild, sep->se_user, sep->se_group, #ifdef LOGIN_CAP sep->se_class, #endif (void *) sep->se_bi, sep->se_server #ifdef IPSEC , (sep->se_policy ? sep->se_policy : "") #endif ); } #define CPMHSIZE 256 #define CPMHMASK (CPMHSIZE-1) #define CHTGRAN 10 #define CHTSIZE 6 typedef struct CTime { unsigned long ct_Ticks; int ct_Count; } CTime; typedef struct CHash { union { struct in_addr c4_Addr; struct in6_addr c6_Addr; } cu_Addr; #define ch_Addr4 cu_Addr.c4_Addr #define ch_Addr6 cu_Addr.c6_Addr int ch_Family; time_t ch_LTime; char *ch_Service; CTime ch_Times[CHTSIZE]; } CHash; static CHash CHashAry[CPMHSIZE]; static int cpmip(const struct servtab *sep, int ctrl) { struct sockaddr_storage rss; socklen_t rssLen = sizeof(rss); int r = 0; /* * If getpeername() fails, just let it through (if logging is * enabled the condition is caught elsewhere) */ if (sep->se_maxcpm > 0 && (sep->se_family == AF_INET || sep->se_family == AF_INET6) && getpeername(ctrl, (struct sockaddr *)&rss, &rssLen) == 0 ) { time_t t = time(NULL); int hv = 0xABC3D20F; int i; int cnt = 0; CHash *chBest = NULL; unsigned int ticks = t / CHTGRAN; struct sockaddr_in *sin4; #ifdef INET6 struct sockaddr_in6 *sin6; #endif sin4 = (struct sockaddr_in *)&rss; #ifdef INET6 sin6 = (struct sockaddr_in6 *)&rss; #endif { char *p; int addrlen; switch (rss.ss_family) { case AF_INET: p = (char *)&sin4->sin_addr; addrlen = sizeof(struct in_addr); break; #ifdef INET6 case AF_INET6: p = (char *)&sin6->sin6_addr; addrlen = sizeof(struct in6_addr); break; #endif default: /* should not happen */ return -1; } for (i = 0; i < addrlen; ++i, ++p) { hv = (hv << 5) ^ (hv >> 23) ^ *p; } hv = (hv ^ (hv >> 16)); } for (i = 0; i < 5; ++i) { CHash *ch = &CHashAry[(hv + i) & CPMHMASK]; if (rss.ss_family == AF_INET && ch->ch_Family == AF_INET && sin4->sin_addr.s_addr == ch->ch_Addr4.s_addr && ch->ch_Service && strcmp(sep->se_service, ch->ch_Service) == 0) { chBest = ch; break; } #ifdef INET6 if (rss.ss_family == AF_INET6 && ch->ch_Family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ch->ch_Addr6) != 0 && ch->ch_Service && strcmp(sep->se_service, ch->ch_Service) == 0) { chBest = ch; break; } #endif if (chBest == NULL || ch->ch_LTime == 0 || ch->ch_LTime < chBest->ch_LTime) { chBest = ch; } } if ((rss.ss_family == AF_INET && (chBest->ch_Family != AF_INET || sin4->sin_addr.s_addr != chBest->ch_Addr4.s_addr)) || chBest->ch_Service == NULL || strcmp(sep->se_service, chBest->ch_Service) != 0) { chBest->ch_Family = sin4->sin_family; chBest->ch_Addr4 = sin4->sin_addr; if (chBest->ch_Service) free(chBest->ch_Service); chBest->ch_Service = strdup(sep->se_service); bzero(chBest->ch_Times, sizeof(chBest->ch_Times)); } #ifdef INET6 if ((rss.ss_family == AF_INET6 && (chBest->ch_Family != AF_INET6 || IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &chBest->ch_Addr6) == 0)) || chBest->ch_Service == NULL || strcmp(sep->se_service, chBest->ch_Service) != 0) { chBest->ch_Family = sin6->sin6_family; chBest->ch_Addr6 = sin6->sin6_addr; if (chBest->ch_Service) free(chBest->ch_Service); chBest->ch_Service = strdup(sep->se_service); bzero(chBest->ch_Times, sizeof(chBest->ch_Times)); } #endif chBest->ch_LTime = t; { CTime *ct = &chBest->ch_Times[ticks % CHTSIZE]; if (ct->ct_Ticks != ticks) { ct->ct_Ticks = ticks; ct->ct_Count = 0; } ++ct->ct_Count; } for (i = 0; i < CHTSIZE; ++i) { CTime *ct = &chBest->ch_Times[i]; if (ct->ct_Ticks <= ticks && ct->ct_Ticks >= ticks - CHTSIZE) { cnt += ct->ct_Count; } } if ((cnt * 60) / (CHTSIZE * CHTGRAN) > sep->se_maxcpm) { char pname[NI_MAXHOST]; getnameinfo((struct sockaddr *)&rss, ((struct sockaddr *)&rss)->sa_len, pname, sizeof(pname), NULL, 0, NI_NUMERICHOST); r = -1; syslog(LOG_ERR, "%s from %s exceeded counts/min (limit %d/min)", sep->se_service, pname, sep->se_maxcpm); } } return(r); } static struct conninfo * search_conn(struct servtab *sep, int ctrl) { struct sockaddr_storage ss; socklen_t sslen = sizeof(ss); struct conninfo *conn; int hv; char pname[NI_MAXHOST], pname2[NI_MAXHOST]; if (sep->se_maxperip <= 0) return NULL; /* * If getpeername() fails, just let it through (if logging is * enabled the condition is caught elsewhere) */ if (getpeername(ctrl, (struct sockaddr *)&ss, &sslen) != 0) return NULL; switch (ss.ss_family) { case AF_INET: hv = hashval((char *)&((struct sockaddr_in *)&ss)->sin_addr, sizeof(struct in_addr)); break; #ifdef INET6 case AF_INET6: hv = hashval((char *)&((struct sockaddr_in6 *)&ss)->sin6_addr, sizeof(struct in6_addr)); break; #endif default: /* * Since we only support AF_INET and AF_INET6, just * let other than AF_INET and AF_INET6 through. */ return NULL; } if (getnameinfo((struct sockaddr *)&ss, sslen, pname, sizeof(pname), NULL, 0, NI_NUMERICHOST) != 0) return NULL; LIST_FOREACH(conn, &sep->se_conn[hv], co_link) { if (getnameinfo((struct sockaddr *)&conn->co_addr, conn->co_addr.ss_len, pname2, sizeof(pname2), NULL, 0, NI_NUMERICHOST) == 0 && strcmp(pname, pname2) == 0) break; } if (conn == NULL) { if ((conn = malloc(sizeof(struct conninfo))) == NULL) { syslog(LOG_ERR, "malloc: %m"); exit(EX_OSERR); } conn->co_proc = malloc(sep->se_maxperip * sizeof(*conn->co_proc)); if (conn->co_proc == NULL) { syslog(LOG_ERR, "malloc: %m"); exit(EX_OSERR); } memcpy(&conn->co_addr, (struct sockaddr *)&ss, sslen); conn->co_numchild = 0; LIST_INSERT_HEAD(&sep->se_conn[hv], conn, co_link); } /* * Since a child process is not invoked yet, we cannot * determine a pid of a child. So, co_proc and co_numchild * should be filled leter. */ return conn; } static int room_conn(struct servtab *sep, struct conninfo *conn) { char pname[NI_MAXHOST]; if (conn->co_numchild >= sep->se_maxperip) { getnameinfo((struct sockaddr *)&conn->co_addr, conn->co_addr.ss_len, pname, sizeof(pname), NULL, 0, NI_NUMERICHOST); syslog(LOG_ERR, "%s from %s exceeded counts (limit %d)", sep->se_service, pname, sep->se_maxperip); return 0; } return 1; } static void addchild_conn(struct conninfo *conn, pid_t pid) { struct procinfo *proc; if (conn == NULL) return; if ((proc = search_proc(pid, 1)) != NULL) { if (proc->pr_conn != NULL) { syslog(LOG_ERR, "addchild_conn: child already on process list"); exit(EX_OSERR); } proc->pr_conn = conn; } conn->co_proc[conn->co_numchild++] = proc; } static void reapchild_conn(pid_t pid) { struct procinfo *proc; struct conninfo *conn; int i; if ((proc = search_proc(pid, 0)) == NULL) return; if ((conn = proc->pr_conn) == NULL) return; for (i = 0; i < conn->co_numchild; ++i) if (conn->co_proc[i] == proc) { conn->co_proc[i] = conn->co_proc[--conn->co_numchild]; break; } free_proc(proc); free_conn(conn); } static void resize_conn(struct servtab *sep, int maxpip) { struct conninfo *conn; int i, j; if (sep->se_maxperip <= 0) return; if (maxpip <= 0) { free_connlist(sep); return; } for (i = 0; i < PERIPSIZE; ++i) { LIST_FOREACH(conn, &sep->se_conn[i], co_link) { for (j = maxpip; j < conn->co_numchild; ++j) free_proc(conn->co_proc[j]); conn->co_proc = realloc(conn->co_proc, maxpip * sizeof(*conn->co_proc)); if (conn->co_proc == NULL) { syslog(LOG_ERR, "realloc: %m"); exit(EX_OSERR); } if (conn->co_numchild > maxpip) conn->co_numchild = maxpip; } } } static void free_connlist(struct servtab *sep) { struct conninfo *conn; int i, j; for (i = 0; i < PERIPSIZE; ++i) { while ((conn = LIST_FIRST(&sep->se_conn[i])) != NULL) { for (j = 0; j < conn->co_numchild; ++j) free_proc(conn->co_proc[j]); conn->co_numchild = 0; free_conn(conn); } } } static void free_conn(struct conninfo *conn) { if (conn == NULL) return; if (conn->co_numchild <= 0) { LIST_REMOVE(conn, co_link); free(conn->co_proc); free(conn); } } static struct procinfo * search_proc(pid_t pid, int add) { struct procinfo *proc; int hv; hv = hashval((char *)&pid, sizeof(pid)); LIST_FOREACH(proc, &proctable[hv], pr_link) { if (proc->pr_pid == pid) break; } if (proc == NULL && add) { if ((proc = malloc(sizeof(struct procinfo))) == NULL) { syslog(LOG_ERR, "malloc: %m"); exit(EX_OSERR); } proc->pr_pid = pid; proc->pr_conn = NULL; LIST_INSERT_HEAD(&proctable[hv], proc, pr_link); } return proc; } static void free_proc(struct procinfo *proc) { if (proc == NULL) return; LIST_REMOVE(proc, pr_link); free(proc); } static int hashval(char *p, int len) { int i, hv = 0xABC3D20F; for (i = 0; i < len; ++i, ++p) hv = (hv << 5) ^ (hv >> 23) ^ *p; hv = (hv ^ (hv >> 16)) & (PERIPSIZE - 1); return hv; } Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.c (revision 312218) @@ -1,643 +1,642 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iscsid.h" static volatile bool sigalrm_received = false; static int nchildren = 0; static void usage(void) { fprintf(stderr, "usage: iscsid [-P pidfile][-d][-m maxproc][-t timeout]\n"); exit(1); } char * checked_strdup(const char *s) { char *c; c = strdup(s); if (c == NULL) log_err(1, "strdup"); return (c); } static void resolve_addr(const struct connection *conn, const char *address, struct addrinfo **ai, bool initiator_side) { struct addrinfo hints; char *arg, *addr, *ch; const char *port; int error, colons = 0; arg = checked_strdup(address); if (arg[0] == '\0') { fail(conn, "empty address"); log_errx(1, "empty address"); } if (arg[0] == '[') { /* * IPv6 address in square brackets, perhaps with port. */ arg++; addr = strsep(&arg, "]"); if (arg == NULL) { fail(conn, "malformed address"); log_errx(1, "malformed address %s", address); } if (arg[0] == '\0') { port = NULL; } else if (arg[0] == ':') { port = arg + 1; } else { fail(conn, "malformed address"); log_errx(1, "malformed address %s", address); } } else { /* * Either IPv6 address without brackets - and without * a port - or IPv4 address. Just count the colons. */ for (ch = arg; *ch != '\0'; ch++) { if (*ch == ':') colons++; } if (colons > 1) { addr = arg; port = NULL; } else { addr = strsep(&arg, ":"); if (arg == NULL) port = NULL; else port = arg; } } if (port == NULL && !initiator_side) port = "3260"; memset(&hints, 0, sizeof(hints)); hints.ai_family = PF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV; if (initiator_side) hints.ai_flags |= AI_PASSIVE; error = getaddrinfo(addr, port, &hints, ai); if (error != 0) { fail(conn, gai_strerror(error)); log_errx(1, "getaddrinfo for %s failed: %s", address, gai_strerror(error)); } } static struct connection * connection_new(int iscsi_fd, const struct iscsi_daemon_request *request) { struct connection *conn; struct iscsi_session_limits *isl; struct addrinfo *from_ai, *to_ai; const char *from_addr, *to_addr; #ifdef ICL_KERNEL_PROXY struct iscsi_daemon_connect idc; #endif int error, sockbuf; conn = calloc(1, sizeof(*conn)); if (conn == NULL) log_err(1, "calloc"); /* * Default values, from RFC 3720, section 12. */ conn->conn_header_digest = CONN_DIGEST_NONE; conn->conn_data_digest = CONN_DIGEST_NONE; conn->conn_initial_r2t = true; conn->conn_immediate_data = true; - conn->conn_max_burst_length = MAX_BURST_LENGTH; - conn->conn_first_burst_length = FIRST_BURST_LENGTH; + conn->conn_max_recv_data_segment_length = 8192; + conn->conn_max_send_data_segment_length = 8192; + conn->conn_max_burst_length = 262144; + conn->conn_first_burst_length = 65536; conn->conn_iscsi_fd = iscsi_fd; conn->conn_session_id = request->idr_session_id; memcpy(&conn->conn_conf, &request->idr_conf, sizeof(conn->conn_conf)); memcpy(&conn->conn_isid, &request->idr_isid, sizeof(conn->conn_isid)); conn->conn_tsih = request->idr_tsih; /* * Read the driver limits and provide reasonable defaults for the ones * the driver doesn't care about. If a max_snd_dsl is not explicitly * provided by the driver then we'll make sure both conn->max_snd_dsl * and isl->max_snd_dsl are set to the rcv_dsl. This preserves historic * behavior. */ isl = &conn->conn_limits; memcpy(isl, &request->idr_limits, sizeof(*isl)); - if (isl->isl_max_recv_data_segment_length == 0) { - conn->conn_max_recv_data_segment_length = 8192; - conn->conn_max_send_data_segment_length = 8192; - isl->isl_max_recv_data_segment_length = 8192; - } else { - conn->conn_max_recv_data_segment_length = - isl->isl_max_recv_data_segment_length; - conn->conn_max_send_data_segment_length = - isl->isl_max_recv_data_segment_length; - } - if (isl->isl_max_send_data_segment_length == 0) { + if (isl->isl_max_recv_data_segment_length == 0) + isl->isl_max_recv_data_segment_length = (1 << 24) - 1; + if (isl->isl_max_send_data_segment_length == 0) isl->isl_max_send_data_segment_length = isl->isl_max_recv_data_segment_length; - } else { + if (isl->isl_max_burst_length == 0) + isl->isl_max_burst_length = (1 << 24) - 1; + if (isl->isl_first_burst_length == 0) + isl->isl_first_burst_length = (1 << 24) - 1; + if (isl->isl_first_burst_length > isl->isl_max_burst_length) + isl->isl_first_burst_length = isl->isl_max_burst_length; + + /* + * Limit default send length in case it won't be negotiated. + * We can't do it for other limits, since they may affect both + * sender and receiver operation, and we must obey defaults. + */ + if (conn->conn_max_send_data_segment_length > + isl->isl_max_send_data_segment_length) { conn->conn_max_send_data_segment_length = isl->isl_max_send_data_segment_length; - } - if (isl->isl_max_burst_length == 0) - isl->isl_max_burst_length = conn->conn_max_burst_length; - if (isl->isl_first_burst_length == 0) { - if (isl->isl_max_burst_length < (int)conn->conn_first_burst_length) - isl->isl_first_burst_length = isl->isl_max_burst_length; - else - isl->isl_first_burst_length = conn->conn_first_burst_length; } from_addr = conn->conn_conf.isc_initiator_addr; to_addr = conn->conn_conf.isc_target_addr; if (from_addr[0] != '\0') resolve_addr(conn, from_addr, &from_ai, true); else from_ai = NULL; resolve_addr(conn, to_addr, &to_ai, false); #ifdef ICL_KERNEL_PROXY if (conn->conn_conf.isc_iser) { memset(&idc, 0, sizeof(idc)); idc.idc_session_id = conn->conn_session_id; if (conn->conn_conf.isc_iser) idc.idc_iser = 1; idc.idc_domain = to_ai->ai_family; idc.idc_socktype = to_ai->ai_socktype; idc.idc_protocol = to_ai->ai_protocol; if (from_ai != NULL) { idc.idc_from_addr = from_ai->ai_addr; idc.idc_from_addrlen = from_ai->ai_addrlen; } idc.idc_to_addr = to_ai->ai_addr; idc.idc_to_addrlen = to_ai->ai_addrlen; log_debugx("connecting to %s using ICL kernel proxy", to_addr); error = ioctl(iscsi_fd, ISCSIDCONNECT, &idc); if (error != 0) { fail(conn, strerror(errno)); log_err(1, "failed to connect to %s " "using ICL kernel proxy: ISCSIDCONNECT", to_addr); } return (conn); } #endif /* ICL_KERNEL_PROXY */ if (conn->conn_conf.isc_iser) { fail(conn, "iSER not supported"); log_errx(1, "iscsid(8) compiled without ICL_KERNEL_PROXY " "does not support iSER"); } conn->conn_socket = socket(to_ai->ai_family, to_ai->ai_socktype, to_ai->ai_protocol); if (conn->conn_socket < 0) { fail(conn, strerror(errno)); log_err(1, "failed to create socket for %s", from_addr); } sockbuf = SOCKBUF_SIZE; if (setsockopt(conn->conn_socket, SOL_SOCKET, SO_RCVBUF, &sockbuf, sizeof(sockbuf)) == -1) log_warn("setsockopt(SO_RCVBUF) failed"); sockbuf = SOCKBUF_SIZE; if (setsockopt(conn->conn_socket, SOL_SOCKET, SO_SNDBUF, &sockbuf, sizeof(sockbuf)) == -1) log_warn("setsockopt(SO_SNDBUF) failed"); if (from_ai != NULL) { error = bind(conn->conn_socket, from_ai->ai_addr, from_ai->ai_addrlen); if (error != 0) { fail(conn, strerror(errno)); log_err(1, "failed to bind to %s", from_addr); } } log_debugx("connecting to %s", to_addr); error = connect(conn->conn_socket, to_ai->ai_addr, to_ai->ai_addrlen); if (error != 0) { fail(conn, strerror(errno)); log_err(1, "failed to connect to %s", to_addr); } return (conn); } static void handoff(struct connection *conn) { struct iscsi_daemon_handoff idh; int error; log_debugx("handing off connection to the kernel"); memset(&idh, 0, sizeof(idh)); idh.idh_session_id = conn->conn_session_id; idh.idh_socket = conn->conn_socket; strlcpy(idh.idh_target_alias, conn->conn_target_alias, sizeof(idh.idh_target_alias)); idh.idh_tsih = conn->conn_tsih; idh.idh_statsn = conn->conn_statsn; idh.idh_header_digest = conn->conn_header_digest; idh.idh_data_digest = conn->conn_data_digest; idh.idh_initial_r2t = conn->conn_initial_r2t; idh.idh_immediate_data = conn->conn_immediate_data; idh.idh_max_recv_data_segment_length = conn->conn_max_recv_data_segment_length; idh.idh_max_send_data_segment_length = conn->conn_max_send_data_segment_length; idh.idh_max_burst_length = conn->conn_max_burst_length; idh.idh_first_burst_length = conn->conn_first_burst_length; error = ioctl(conn->conn_iscsi_fd, ISCSIDHANDOFF, &idh); if (error != 0) log_err(1, "ISCSIDHANDOFF"); } void fail(const struct connection *conn, const char *reason) { struct iscsi_daemon_fail idf; int error, saved_errno; saved_errno = errno; memset(&idf, 0, sizeof(idf)); idf.idf_session_id = conn->conn_session_id; strlcpy(idf.idf_reason, reason, sizeof(idf.idf_reason)); error = ioctl(conn->conn_iscsi_fd, ISCSIDFAIL, &idf); if (error != 0) log_err(1, "ISCSIDFAIL"); errno = saved_errno; } /* * XXX: I CANT INTO LATIN */ static void capsicate(struct connection *conn) { int error; cap_rights_t rights; #ifdef ICL_KERNEL_PROXY const unsigned long cmds[] = { ISCSIDCONNECT, ISCSIDSEND, ISCSIDRECEIVE, ISCSIDHANDOFF, ISCSIDFAIL, ISCSISADD, ISCSISREMOVE, ISCSISMODIFY }; #else const unsigned long cmds[] = { ISCSIDHANDOFF, ISCSIDFAIL, ISCSISADD, ISCSISREMOVE, ISCSISMODIFY }; #endif cap_rights_init(&rights, CAP_IOCTL); error = cap_rights_limit(conn->conn_iscsi_fd, &rights); if (error != 0 && errno != ENOSYS) log_err(1, "cap_rights_limit"); error = cap_ioctls_limit(conn->conn_iscsi_fd, cmds, sizeof(cmds) / sizeof(cmds[0])); if (error != 0 && errno != ENOSYS) log_err(1, "cap_ioctls_limit"); error = cap_enter(); if (error != 0 && errno != ENOSYS) log_err(1, "cap_enter"); if (cap_sandboxed()) log_debugx("Capsicum capability mode enabled"); else log_warnx("Capsicum capability mode not supported"); } bool timed_out(void) { return (sigalrm_received); } static void sigalrm_handler(int dummy __unused) { /* * It would be easiest to just log an error and exit. We can't * do this, though, because log_errx() is not signal safe, since * it calls syslog(3). Instead, set a flag checked by pdu_send() * and pdu_receive(), to call log_errx() there. Should they fail * to notice, we'll exit here one second later. */ if (sigalrm_received) { /* * Oh well. Just give up and quit. */ _exit(2); } sigalrm_received = true; } static void set_timeout(int timeout) { struct sigaction sa; struct itimerval itv; int error; if (timeout <= 0) { log_debugx("session timeout disabled"); return; } bzero(&sa, sizeof(sa)); sa.sa_handler = sigalrm_handler; sigfillset(&sa.sa_mask); error = sigaction(SIGALRM, &sa, NULL); if (error != 0) log_err(1, "sigaction"); /* * First SIGALRM will arive after conf_timeout seconds. * If we do nothing, another one will arrive a second later. */ bzero(&itv, sizeof(itv)); itv.it_interval.tv_sec = 1; itv.it_value.tv_sec = timeout; log_debugx("setting session timeout to %d seconds", timeout); error = setitimer(ITIMER_REAL, &itv, NULL); if (error != 0) log_err(1, "setitimer"); } static void sigchld_handler(int dummy __unused) { /* * The only purpose of this handler is to make SIGCHLD * interrupt the ISCSIDWAIT ioctl(2), so we can call * wait_for_children(). */ } static void register_sigchld(void) { struct sigaction sa; int error; bzero(&sa, sizeof(sa)); sa.sa_handler = sigchld_handler; sigfillset(&sa.sa_mask); error = sigaction(SIGCHLD, &sa, NULL); if (error != 0) log_err(1, "sigaction"); } static void handle_request(int iscsi_fd, const struct iscsi_daemon_request *request, int timeout) { struct connection *conn; log_set_peer_addr(request->idr_conf.isc_target_addr); if (request->idr_conf.isc_target[0] != '\0') { log_set_peer_name(request->idr_conf.isc_target); setproctitle("%s (%s)", request->idr_conf.isc_target_addr, request->idr_conf.isc_target); } else { setproctitle("%s", request->idr_conf.isc_target_addr); } conn = connection_new(iscsi_fd, request); set_timeout(timeout); capsicate(conn); login(conn); if (conn->conn_conf.isc_discovery != 0) discovery(conn); else handoff(conn); log_debugx("nothing more to do; exiting"); exit (0); } static int wait_for_children(bool block) { pid_t pid; int status; int num = 0; for (;;) { /* * If "block" is true, wait for at least one process. */ if (block && num == 0) pid = wait4(-1, &status, 0, NULL); else pid = wait4(-1, &status, WNOHANG, NULL); if (pid <= 0) break; if (WIFSIGNALED(status)) { log_warnx("child process %d terminated with signal %d", pid, WTERMSIG(status)); } else if (WEXITSTATUS(status) != 0) { log_warnx("child process %d terminated with exit status %d", pid, WEXITSTATUS(status)); } else { log_debugx("child process %d terminated gracefully", pid); } num++; } return (num); } int main(int argc, char **argv) { int ch, debug = 0, error, iscsi_fd, maxproc = 30, retval, saved_errno, timeout = 60; bool dont_daemonize = false; struct pidfh *pidfh; pid_t pid, otherpid; const char *pidfile_path = DEFAULT_PIDFILE; struct iscsi_daemon_request request; while ((ch = getopt(argc, argv, "P:dl:m:t:")) != -1) { switch (ch) { case 'P': pidfile_path = optarg; break; case 'd': dont_daemonize = true; debug++; break; case 'l': debug = atoi(optarg); break; case 'm': maxproc = atoi(optarg); break; case 't': timeout = atoi(optarg); break; case '?': default: usage(); } } argc -= optind; if (argc != 0) usage(); log_init(debug); pidfh = pidfile_open(pidfile_path, 0600, &otherpid); if (pidfh == NULL) { if (errno == EEXIST) log_errx(1, "daemon already running, pid: %jd.", (intmax_t)otherpid); log_err(1, "cannot open or create pidfile \"%s\"", pidfile_path); } iscsi_fd = open(ISCSI_PATH, O_RDWR); if (iscsi_fd < 0 && errno == ENOENT) { saved_errno = errno; retval = kldload("iscsi"); if (retval != -1) iscsi_fd = open(ISCSI_PATH, O_RDWR); else errno = saved_errno; } if (iscsi_fd < 0) log_err(1, "failed to open %s", ISCSI_PATH); if (dont_daemonize == false) { if (daemon(0, 0) == -1) { log_warn("cannot daemonize"); pidfile_remove(pidfh); exit(1); } } pidfile_write(pidfh); register_sigchld(); for (;;) { log_debugx("waiting for request from the kernel"); memset(&request, 0, sizeof(request)); error = ioctl(iscsi_fd, ISCSIDWAIT, &request); if (error != 0) { if (errno == EINTR) { nchildren -= wait_for_children(false); assert(nchildren >= 0); continue; } log_err(1, "ISCSIDWAIT"); } if (dont_daemonize) { log_debugx("not forking due to -d flag; " "will exit after servicing a single request"); } else { nchildren -= wait_for_children(false); assert(nchildren >= 0); while (maxproc > 0 && nchildren >= maxproc) { log_debugx("maxproc limit of %d child processes hit; " "waiting for child process to exit", maxproc); nchildren -= wait_for_children(true); assert(nchildren >= 0); } log_debugx("incoming connection; forking child process #%d", nchildren); nchildren++; pid = fork(); if (pid < 0) log_err(1, "fork"); if (pid > 0) continue; } pidfile_close(pidfh); handle_request(iscsi_fd, &request, timeout); } return (0); } Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.h =================================================================== --- projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.h (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.h (revision 312218) @@ -1,152 +1,150 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef ISCSID_H #define ISCSID_H #include #include #include #define DEFAULT_PIDFILE "/var/run/iscsid.pid" #define CONN_DIGEST_NONE 0 #define CONN_DIGEST_CRC32C 1 #define CONN_MUTUAL_CHALLENGE_LEN 1024 #define SOCKBUF_SIZE 1048576 -#define MAX_BURST_LENGTH (256 * 1024) -#define FIRST_BURST_LENGTH (128 * 1024) struct connection { int conn_iscsi_fd; int conn_socket; unsigned int conn_session_id; struct iscsi_session_conf conn_conf; struct iscsi_session_limits conn_limits; char conn_target_alias[ISCSI_ADDR_LEN]; uint8_t conn_isid[6]; uint16_t conn_tsih; uint32_t conn_statsn; int conn_header_digest; int conn_data_digest; bool conn_initial_r2t; bool conn_immediate_data; int conn_max_recv_data_segment_length; int conn_max_send_data_segment_length; int conn_max_burst_length; int conn_first_burst_length; struct chap *conn_mutual_chap; }; struct pdu { struct connection *pdu_connection; struct iscsi_bhs *pdu_bhs; char *pdu_data; size_t pdu_data_len; }; #define KEYS_MAX 1024 struct keys { char *keys_names[KEYS_MAX]; char *keys_values[KEYS_MAX]; char *keys_data; size_t keys_data_len; }; #define CHAP_CHALLENGE_LEN 1024 #define CHAP_DIGEST_LEN 16 /* Equal to MD5 digest size. */ struct chap { unsigned char chap_id; char chap_challenge[CHAP_CHALLENGE_LEN]; char chap_response[CHAP_DIGEST_LEN]; }; struct rchap { char *rchap_secret; unsigned char rchap_id; void *rchap_challenge; size_t rchap_challenge_len; }; struct chap *chap_new(void); char *chap_get_id(const struct chap *chap); char *chap_get_challenge(const struct chap *chap); int chap_receive(struct chap *chap, const char *response); int chap_authenticate(struct chap *chap, const char *secret); void chap_delete(struct chap *chap); struct rchap *rchap_new(const char *secret); int rchap_receive(struct rchap *rchap, const char *id, const char *challenge); char *rchap_get_response(struct rchap *rchap); void rchap_delete(struct rchap *rchap); struct keys *keys_new(void); void keys_delete(struct keys *key); void keys_load(struct keys *keys, const struct pdu *pdu); void keys_save(struct keys *keys, struct pdu *pdu); const char *keys_find(struct keys *keys, const char *name); void keys_add(struct keys *keys, const char *name, const char *value); void keys_add_int(struct keys *keys, const char *name, int value); struct pdu *pdu_new(struct connection *ic); struct pdu *pdu_new_response(struct pdu *request); void pdu_receive(struct pdu *request); void pdu_send(struct pdu *response); void pdu_delete(struct pdu *ip); void login(struct connection *ic); void discovery(struct connection *ic); void log_init(int level); void log_set_peer_name(const char *name); void log_set_peer_addr(const char *addr); void log_err(int, const char *, ...) __dead2 __printflike(2, 3); void log_errx(int, const char *, ...) __dead2 __printflike(2, 3); void log_warn(const char *, ...) __printflike(1, 2); void log_warnx(const char *, ...) __printflike(1, 2); void log_debugx(const char *, ...) __printflike(1, 2); char *checked_strdup(const char *); bool timed_out(void); void fail(const struct connection *, const char *); #endif /* !ISCSID_H */ Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/login.c =================================================================== --- projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/login.c (revision 312217) +++ projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/login.c (revision 312218) @@ -1,887 +1,890 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "iscsid.h" #include "iscsi_proto.h" static int login_nsg(const struct pdu *response) { struct iscsi_bhs_login_response *bhslr; bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; return (bhslr->bhslr_flags & 0x03); } static void login_set_nsg(struct pdu *request, int nsg) { struct iscsi_bhs_login_request *bhslr; assert(nsg == BHSLR_STAGE_SECURITY_NEGOTIATION || nsg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION || nsg == BHSLR_STAGE_FULL_FEATURE_PHASE); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; bhslr->bhslr_flags &= 0xFC; bhslr->bhslr_flags |= nsg; } static void login_set_csg(struct pdu *request, int csg) { struct iscsi_bhs_login_request *bhslr; assert(csg == BHSLR_STAGE_SECURITY_NEGOTIATION || csg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION || csg == BHSLR_STAGE_FULL_FEATURE_PHASE); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; bhslr->bhslr_flags &= 0xF3; bhslr->bhslr_flags |= csg << 2; } static const char * login_target_error_str(int class, int detail) { static char msg[128]; /* * RFC 3270, 10.13.5. Status-Class and Status-Detail */ switch (class) { case 0x01: switch (detail) { case 0x01: return ("Target moved temporarily"); case 0x02: return ("Target moved permanently"); default: snprintf(msg, sizeof(msg), "unknown redirection; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } case 0x02: switch (detail) { case 0x00: return ("Initiator error"); case 0x01: return ("Authentication failure"); case 0x02: return ("Authorization failure"); case 0x03: return ("Not found"); case 0x04: return ("Target removed"); case 0x05: return ("Unsupported version"); case 0x06: return ("Too many connections"); case 0x07: return ("Missing parameter"); case 0x08: return ("Can't include in session"); case 0x09: return ("Session type not supported"); case 0x0a: return ("Session does not exist"); case 0x0b: return ("Invalid during login"); default: snprintf(msg, sizeof(msg), "unknown initiator error; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } case 0x03: switch (detail) { case 0x00: return ("Target error"); case 0x01: return ("Service unavailable"); case 0x02: return ("Out of resources"); default: snprintf(msg, sizeof(msg), "unknown target error; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } default: snprintf(msg, sizeof(msg), "unknown error; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } } static void kernel_modify(const struct connection *conn, const char *target_address) { struct iscsi_session_modify ism; int error; memset(&ism, 0, sizeof(ism)); ism.ism_session_id = conn->conn_session_id; memcpy(&ism.ism_conf, &conn->conn_conf, sizeof(ism.ism_conf)); strlcpy(ism.ism_conf.isc_target_addr, target_address, sizeof(ism.ism_conf.isc_target)); error = ioctl(conn->conn_iscsi_fd, ISCSISMODIFY, &ism); if (error != 0) { log_err(1, "failed to redirect to %s: ISCSISMODIFY", target_address); } } /* * XXX: The way it works is suboptimal; what should happen is described * in draft-gilligan-iscsi-fault-tolerance-00. That, however, would * be much more complicated: we would need to keep "dependencies" * for sessions, so that, in case described in draft and using draft * terminology, we would have three sessions: one for discovery, * one for initial target portal, and one for redirect portal. * This would allow us to "backtrack" on connection failure, * as described in draft. */ static void login_handle_redirection(struct connection *conn, struct pdu *response) { struct iscsi_bhs_login_response *bhslr; struct keys *response_keys; const char *target_address; bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; assert (bhslr->bhslr_status_class == 1); response_keys = keys_new(); keys_load(response_keys, response); target_address = keys_find(response_keys, "TargetAddress"); if (target_address == NULL) log_errx(1, "received redirection without TargetAddress"); if (target_address[0] == '\0') log_errx(1, "received redirection with empty TargetAddress"); if (strlen(target_address) >= sizeof(conn->conn_conf.isc_target_addr) - 1) log_errx(1, "received TargetAddress is too long"); log_debugx("received redirection to \"%s\"", target_address); kernel_modify(conn, target_address); keys_delete(response_keys); } static struct pdu * login_receive(struct connection *conn) { struct pdu *response; struct iscsi_bhs_login_response *bhslr; const char *errorstr; static bool initial = true; response = pdu_new(conn); pdu_receive(response); if (response->pdu_bhs->bhs_opcode != ISCSI_BHS_OPCODE_LOGIN_RESPONSE) { log_errx(1, "protocol error: received invalid opcode 0x%x", response->pdu_bhs->bhs_opcode); } bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; /* * XXX: Implement the C flag some day. */ if ((bhslr->bhslr_flags & BHSLR_FLAGS_CONTINUE) != 0) log_errx(1, "received Login PDU with unsupported \"C\" flag"); if (bhslr->bhslr_version_max != 0x00) log_errx(1, "received Login PDU with unsupported " "Version-max 0x%x", bhslr->bhslr_version_max); if (bhslr->bhslr_version_active != 0x00) log_errx(1, "received Login PDU with unsupported " "Version-active 0x%x", bhslr->bhslr_version_active); if (bhslr->bhslr_status_class == 1) { login_handle_redirection(conn, response); log_debugx("redirection handled; exiting"); exit(0); } if (bhslr->bhslr_status_class != 0) { errorstr = login_target_error_str(bhslr->bhslr_status_class, bhslr->bhslr_status_detail); fail(conn, errorstr); log_errx(1, "target returned error: %s", errorstr); } if (initial == false && ntohl(bhslr->bhslr_statsn) != conn->conn_statsn + 1) { /* * It's a warning, not an error, to work around what seems * to be bug in NetBSD iSCSI target. */ log_warnx("received Login PDU with wrong StatSN: " "is %u, should be %u", ntohl(bhslr->bhslr_statsn), conn->conn_statsn + 1); } conn->conn_tsih = ntohs(bhslr->bhslr_tsih); conn->conn_statsn = ntohl(bhslr->bhslr_statsn); initial = false; return (response); } static struct pdu * login_new_request(struct connection *conn, int csg) { struct pdu *request; struct iscsi_bhs_login_request *bhslr; int nsg; request = pdu_new(conn); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; bhslr->bhslr_opcode = ISCSI_BHS_OPCODE_LOGIN_REQUEST | ISCSI_BHS_OPCODE_IMMEDIATE; bhslr->bhslr_flags = BHSLR_FLAGS_TRANSIT; switch (csg) { case BHSLR_STAGE_SECURITY_NEGOTIATION: nsg = BHSLR_STAGE_OPERATIONAL_NEGOTIATION; break; case BHSLR_STAGE_OPERATIONAL_NEGOTIATION: nsg = BHSLR_STAGE_FULL_FEATURE_PHASE; break; default: assert(!"invalid csg"); log_errx(1, "invalid csg %d", csg); } login_set_csg(request, csg); login_set_nsg(request, nsg); memcpy(bhslr->bhslr_isid, &conn->conn_isid, sizeof(bhslr->bhslr_isid)); bhslr->bhslr_tsih = htons(conn->conn_tsih); bhslr->bhslr_initiator_task_tag = 0; bhslr->bhslr_cmdsn = 0; bhslr->bhslr_expstatsn = htonl(conn->conn_statsn + 1); return (request); } static int login_list_prefers(const char *list, const char *choice1, const char *choice2) { char *tofree, *str, *token; tofree = str = checked_strdup(list); while ((token = strsep(&str, ",")) != NULL) { if (strcmp(token, choice1) == 0) { free(tofree); return (1); } if (strcmp(token, choice2) == 0) { free(tofree); return (2); } } free(tofree); return (-1); } static void login_negotiate_key(struct connection *conn, const char *name, const char *value) { struct iscsi_session_limits *isl; int which, tmp; isl = &conn->conn_limits; if (strcmp(name, "TargetAlias") == 0) { strlcpy(conn->conn_target_alias, value, sizeof(conn->conn_target_alias)); } else if (strcmp(value, "Irrelevant") == 0) { /* Ignore. */ } else if (strcmp(name, "HeaderDigest") == 0) { which = login_list_prefers(value, "CRC32C", "None"); switch (which) { case 1: log_debugx("target prefers CRC32C " "for header digest; we'll use it"); conn->conn_header_digest = CONN_DIGEST_CRC32C; break; case 2: log_debugx("target prefers not to do " "header digest; we'll comply"); break; default: log_warnx("target sent unrecognized " "HeaderDigest value \"%s\"; will use None", value); break; } } else if (strcmp(name, "DataDigest") == 0) { which = login_list_prefers(value, "CRC32C", "None"); switch (which) { case 1: log_debugx("target prefers CRC32C " "for data digest; we'll use it"); conn->conn_data_digest = CONN_DIGEST_CRC32C; break; case 2: log_debugx("target prefers not to do " "data digest; we'll comply"); break; default: log_warnx("target sent unrecognized " "DataDigest value \"%s\"; will use None", value); break; } } else if (strcmp(name, "MaxConnections") == 0) { /* Ignore. */ } else if (strcmp(name, "InitialR2T") == 0) { if (strcmp(value, "Yes") == 0) conn->conn_initial_r2t = true; else conn->conn_initial_r2t = false; } else if (strcmp(name, "ImmediateData") == 0) { if (strcmp(value, "Yes") == 0) conn->conn_immediate_data = true; else conn->conn_immediate_data = false; } else if (strcmp(name, "MaxRecvDataSegmentLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) log_errx(1, "received invalid " "MaxRecvDataSegmentLength"); if (tmp > isl->isl_max_send_data_segment_length) { log_debugx("capping max_send_data_segment_length " "from %d to %d", tmp, isl->isl_max_send_data_segment_length); tmp = isl->isl_max_send_data_segment_length; } conn->conn_max_send_data_segment_length = tmp; + /* We received target's limit, that means it accepted our's. */ + conn->conn_max_recv_data_segment_length = + isl->isl_max_recv_data_segment_length; } else if (strcmp(name, "MaxBurstLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) log_errx(1, "received invalid MaxBurstLength"); if (tmp > isl->isl_max_burst_length) { log_debugx("capping MaxBurstLength " "from %d to %d", tmp, isl->isl_max_burst_length); tmp = isl->isl_max_burst_length; } conn->conn_max_burst_length = tmp; } else if (strcmp(name, "FirstBurstLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) log_errx(1, "received invalid FirstBurstLength"); if (tmp > isl->isl_first_burst_length) { log_debugx("capping FirstBurstLength " "from %d to %d", tmp, isl->isl_first_burst_length); tmp = isl->isl_first_burst_length; } conn->conn_first_burst_length = tmp; } else if (strcmp(name, "DefaultTime2Wait") == 0) { /* Ignore */ } else if (strcmp(name, "DefaultTime2Retain") == 0) { /* Ignore */ } else if (strcmp(name, "MaxOutstandingR2T") == 0) { /* Ignore */ } else if (strcmp(name, "DataPDUInOrder") == 0) { /* Ignore */ } else if (strcmp(name, "DataSequenceInOrder") == 0) { /* Ignore */ } else if (strcmp(name, "ErrorRecoveryLevel") == 0) { /* Ignore */ } else if (strcmp(name, "OFMarker") == 0) { /* Ignore */ } else if (strcmp(name, "IFMarker") == 0) { /* Ignore */ } else if (strcmp(name, "RDMAExtensions") == 0) { if (conn->conn_conf.isc_iser == 1 && strcmp(value, "Yes") != 0) { log_errx(1, "received unsupported RDMAExtensions"); } } else if (strcmp(name, "InitiatorRecvDataSegmentLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) log_errx(1, "received invalid " "InitiatorRecvDataSegmentLength"); if ((int)tmp > isl->isl_max_recv_data_segment_length) { log_debugx("capping InitiatorRecvDataSegmentLength " "from %d to %d", tmp, isl->isl_max_recv_data_segment_length); tmp = isl->isl_max_recv_data_segment_length; } conn->conn_max_recv_data_segment_length = tmp; } else if (strcmp(name, "TargetPortalGroupTag") == 0) { /* Ignore */ } else if (strcmp(name, "TargetRecvDataSegmentLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) { log_errx(1, "received invalid TargetRecvDataSegmentLength"); } if (tmp > isl->isl_max_send_data_segment_length) { log_debugx("capping TargetRecvDataSegmentLength " "from %d to %d", tmp, isl->isl_max_send_data_segment_length); tmp = isl->isl_max_send_data_segment_length; } conn->conn_max_send_data_segment_length = tmp; } else { log_debugx("unknown key \"%s\"; ignoring", name); } } static void login_negotiate(struct connection *conn) { struct pdu *request, *response; struct keys *request_keys, *response_keys; struct iscsi_bhs_login_response *bhslr; int i, nrequests = 0; struct iscsi_session_limits *isl; log_debugx("beginning operational parameter negotiation"); request = login_new_request(conn, BHSLR_STAGE_OPERATIONAL_NEGOTIATION); request_keys = keys_new(); isl = &conn->conn_limits; log_debugx("Limits for offload \"%s\" are " "MaxRecvDataSegment=%d, max_send_dsl=%d, " "MaxBurstLength=%d, FirstBurstLength=%d", conn->conn_conf.isc_offload, isl->isl_max_recv_data_segment_length, isl->isl_max_send_data_segment_length, isl->isl_max_burst_length, isl->isl_first_burst_length); /* * The following keys are irrelevant for discovery sessions. */ if (conn->conn_conf.isc_discovery == 0) { if (conn->conn_conf.isc_header_digest != 0) keys_add(request_keys, "HeaderDigest", "CRC32C"); else keys_add(request_keys, "HeaderDigest", "None"); if (conn->conn_conf.isc_data_digest != 0) keys_add(request_keys, "DataDigest", "CRC32C"); else keys_add(request_keys, "DataDigest", "None"); keys_add(request_keys, "ImmediateData", "Yes"); keys_add_int(request_keys, "MaxBurstLength", isl->isl_max_burst_length); keys_add_int(request_keys, "FirstBurstLength", isl->isl_first_burst_length); keys_add(request_keys, "InitialR2T", "Yes"); keys_add(request_keys, "MaxOutstandingR2T", "1"); if (conn->conn_conf.isc_iser == 1) { keys_add_int(request_keys, "InitiatorRecvDataSegmentLength", isl->isl_max_recv_data_segment_length); keys_add_int(request_keys, "TargetRecvDataSegmentLength", isl->isl_max_send_data_segment_length); keys_add(request_keys, "RDMAExtensions", "Yes"); } else { keys_add_int(request_keys, "MaxRecvDataSegmentLength", isl->isl_max_recv_data_segment_length); } } else { keys_add(request_keys, "HeaderDigest", "None"); keys_add(request_keys, "DataDigest", "None"); keys_add_int(request_keys, "MaxRecvDataSegmentLength", isl->isl_max_recv_data_segment_length); } keys_add(request_keys, "DefaultTime2Wait", "0"); keys_add(request_keys, "DefaultTime2Retain", "0"); keys_add(request_keys, "ErrorRecoveryLevel", "0"); keys_save(request_keys, request); keys_delete(request_keys); request_keys = NULL; pdu_send(request); pdu_delete(request); request = NULL; response = login_receive(conn); response_keys = keys_new(); keys_load(response_keys, response); for (i = 0; i < KEYS_MAX; i++) { if (response_keys->keys_names[i] == NULL) break; login_negotiate_key(conn, response_keys->keys_names[i], response_keys->keys_values[i]); } keys_delete(response_keys); response_keys = NULL; for (;;) { bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0) break; nrequests++; if (nrequests > 5) { log_warnx("received login response " "without the \"T\" flag too many times; giving up"); break; } log_debugx("received login response " "without the \"T\" flag; sending another request"); pdu_delete(response); request = login_new_request(conn, BHSLR_STAGE_OPERATIONAL_NEGOTIATION); pdu_send(request); pdu_delete(request); response = login_receive(conn); } if (login_nsg(response) != BHSLR_STAGE_FULL_FEATURE_PHASE) log_warnx("received final login response with wrong NSG 0x%x", login_nsg(response)); pdu_delete(response); log_debugx("operational parameter negotiation done; " "transitioning to Full Feature phase"); } static void login_send_chap_a(struct connection *conn) { struct pdu *request; struct keys *request_keys; request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION); request_keys = keys_new(); keys_add(request_keys, "CHAP_A", "5"); keys_save(request_keys, request); keys_delete(request_keys); pdu_send(request); pdu_delete(request); } static void login_send_chap_r(struct pdu *response) { struct connection *conn; struct pdu *request; struct keys *request_keys, *response_keys; struct rchap *rchap; const char *chap_a, *chap_c, *chap_i; char *chap_r; int error; char *mutual_chap_c, *mutual_chap_i; /* * As in the rest of the initiator, 'request' means * 'initiator -> target', and 'response' means 'target -> initiator', * * So, here the 'response' from the target is the packet that contains * CHAP challenge; our CHAP response goes into 'request'. */ conn = response->pdu_connection; response_keys = keys_new(); keys_load(response_keys, response); /* * First, compute the response. */ chap_a = keys_find(response_keys, "CHAP_A"); if (chap_a == NULL) log_errx(1, "received CHAP packet without CHAP_A"); chap_c = keys_find(response_keys, "CHAP_C"); if (chap_c == NULL) log_errx(1, "received CHAP packet without CHAP_C"); chap_i = keys_find(response_keys, "CHAP_I"); if (chap_i == NULL) log_errx(1, "received CHAP packet without CHAP_I"); if (strcmp(chap_a, "5") != 0) { log_errx(1, "received CHAP packet " "with unsupported CHAP_A \"%s\"", chap_a); } rchap = rchap_new(conn->conn_conf.isc_secret); error = rchap_receive(rchap, chap_i, chap_c); if (error != 0) { log_errx(1, "received CHAP packet " "with malformed CHAP_I or CHAP_C"); } chap_r = rchap_get_response(rchap); rchap_delete(rchap); keys_delete(response_keys); request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION); request_keys = keys_new(); keys_add(request_keys, "CHAP_N", conn->conn_conf.isc_user); keys_add(request_keys, "CHAP_R", chap_r); free(chap_r); /* * If we want mutual authentication, we're expected to send * our CHAP_I/CHAP_C now. */ if (conn->conn_conf.isc_mutual_user[0] != '\0') { log_debugx("requesting mutual authentication; " "binary challenge size is %zd bytes", sizeof(conn->conn_mutual_chap->chap_challenge)); assert(conn->conn_mutual_chap == NULL); conn->conn_mutual_chap = chap_new(); mutual_chap_i = chap_get_id(conn->conn_mutual_chap); mutual_chap_c = chap_get_challenge(conn->conn_mutual_chap); keys_add(request_keys, "CHAP_I", mutual_chap_i); keys_add(request_keys, "CHAP_C", mutual_chap_c); free(mutual_chap_i); free(mutual_chap_c); } keys_save(request_keys, request); keys_delete(request_keys); pdu_send(request); pdu_delete(request); } static void login_verify_mutual(const struct pdu *response) { struct connection *conn; struct keys *response_keys; const char *chap_n, *chap_r; int error; conn = response->pdu_connection; response_keys = keys_new(); keys_load(response_keys, response); chap_n = keys_find(response_keys, "CHAP_N"); if (chap_n == NULL) log_errx(1, "received CHAP Response PDU without CHAP_N"); chap_r = keys_find(response_keys, "CHAP_R"); if (chap_r == NULL) log_errx(1, "received CHAP Response PDU without CHAP_R"); error = chap_receive(conn->conn_mutual_chap, chap_r); if (error != 0) log_errx(1, "received CHAP Response PDU with invalid CHAP_R"); if (strcmp(chap_n, conn->conn_conf.isc_mutual_user) != 0) { fail(conn, "Mutual CHAP failed"); log_errx(1, "mutual CHAP authentication failed: wrong user"); } error = chap_authenticate(conn->conn_mutual_chap, conn->conn_conf.isc_mutual_secret); if (error != 0) { fail(conn, "Mutual CHAP failed"); log_errx(1, "mutual CHAP authentication failed: wrong secret"); } keys_delete(response_keys); chap_delete(conn->conn_mutual_chap); conn->conn_mutual_chap = NULL; log_debugx("mutual CHAP authentication succeeded"); } static void login_chap(struct connection *conn) { struct pdu *response; log_debugx("beginning CHAP authentication; sending CHAP_A"); login_send_chap_a(conn); log_debugx("waiting for CHAP_A/CHAP_C/CHAP_I"); response = login_receive(conn); log_debugx("sending CHAP_N/CHAP_R"); login_send_chap_r(response); pdu_delete(response); /* * XXX: Make sure this is not susceptible to MITM. */ log_debugx("waiting for CHAP result"); response = login_receive(conn); if (conn->conn_conf.isc_mutual_user[0] != '\0') login_verify_mutual(response); pdu_delete(response); log_debugx("CHAP authentication done"); } void login(struct connection *conn) { struct pdu *request, *response; struct keys *request_keys, *response_keys; struct iscsi_bhs_login_response *bhslr2; const char *auth_method; int i; log_debugx("beginning Login phase; sending Login PDU"); request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION); request_keys = keys_new(); if (conn->conn_conf.isc_mutual_user[0] != '\0') { keys_add(request_keys, "AuthMethod", "CHAP"); } else if (conn->conn_conf.isc_user[0] != '\0') { /* * Give target a chance to skip authentication if it * doesn't feel like it. * * None is first, CHAP second; this is to work around * what seems to be LIO (Linux target) bug: otherwise, * if target is configured with no authentication, * and we are configured to authenticate, the target * will erroneously respond with AuthMethod=CHAP * instead of AuthMethod=None, and will subsequently * fail the connection. This usually happens with * Discovery sessions, which default to no authentication. */ keys_add(request_keys, "AuthMethod", "None,CHAP"); } else { keys_add(request_keys, "AuthMethod", "None"); } keys_add(request_keys, "InitiatorName", conn->conn_conf.isc_initiator); if (conn->conn_conf.isc_initiator_alias[0] != '\0') { keys_add(request_keys, "InitiatorAlias", conn->conn_conf.isc_initiator_alias); } if (conn->conn_conf.isc_discovery == 0) { keys_add(request_keys, "SessionType", "Normal"); keys_add(request_keys, "TargetName", conn->conn_conf.isc_target); } else { keys_add(request_keys, "SessionType", "Discovery"); } keys_save(request_keys, request); keys_delete(request_keys); pdu_send(request); pdu_delete(request); response = login_receive(conn); response_keys = keys_new(); keys_load(response_keys, response); for (i = 0; i < KEYS_MAX; i++) { if (response_keys->keys_names[i] == NULL) break; /* * Not interested in AuthMethod at this point; we only need * to parse things such as TargetAlias. * * XXX: This is somewhat ugly. We should have a way to apply * all the keys to the session and use that by default * instead of discarding them. */ if (strcmp(response_keys->keys_names[i], "AuthMethod") == 0) continue; login_negotiate_key(conn, response_keys->keys_names[i], response_keys->keys_values[i]); } bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs; if ((bhslr2->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0 && login_nsg(response) == BHSLR_STAGE_OPERATIONAL_NEGOTIATION) { if (conn->conn_conf.isc_mutual_user[0] != '\0') { log_errx(1, "target requested transition " "to operational parameter negotiation, " "but we require mutual CHAP"); } log_debugx("target requested transition " "to operational parameter negotiation"); keys_delete(response_keys); pdu_delete(response); login_negotiate(conn); return; } auth_method = keys_find(response_keys, "AuthMethod"); if (auth_method == NULL) log_errx(1, "received response without AuthMethod"); if (strcmp(auth_method, "None") == 0) { if (conn->conn_conf.isc_mutual_user[0] != '\0') { log_errx(1, "target does not require authantication, " "but we require mutual CHAP"); } log_debugx("target does not require authentication"); keys_delete(response_keys); pdu_delete(response); login_negotiate(conn); return; } if (strcmp(auth_method, "CHAP") != 0) { fail(conn, "Unsupported AuthMethod"); log_errx(1, "received response " "with unsupported AuthMethod \"%s\"", auth_method); } if (conn->conn_conf.isc_user[0] == '\0' || conn->conn_conf.isc_secret[0] == '\0') { fail(conn, "Authentication required"); log_errx(1, "target requests CHAP authentication, but we don't " "have user and secret"); } keys_delete(response_keys); response_keys = NULL; pdu_delete(response); response = NULL; login_chap(conn); login_negotiate(conn); } Index: projects/netbsd-tests-upstream-01-2017 =================================================================== --- projects/netbsd-tests-upstream-01-2017 (revision 312217) +++ projects/netbsd-tests-upstream-01-2017 (revision 312218) Property changes on: projects/netbsd-tests-upstream-01-2017 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r312125-312217