Index: user/alc/PQ_LAUNDRY/contrib/netbsd-tests/lib/libc/locale/t_mbstowcs.c =================================================================== --- user/alc/PQ_LAUNDRY/contrib/netbsd-tests/lib/libc/locale/t_mbstowcs.c (revision 306831) +++ user/alc/PQ_LAUNDRY/contrib/netbsd-tests/lib/libc/locale/t_mbstowcs.c (revision 306832) @@ -1,211 +1,211 @@ /* $NetBSD: t_mbstowcs.c,v 1.1 2011/07/15 07:35:21 jruoho Exp $ */ /*- * Copyright (c) 2011 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c)2003 Citrus Project, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __COPYRIGHT("@(#) Copyright (c) 2011\ The NetBSD Foundation, inc. All rights reserved."); __RCSID("$NetBSD: t_mbstowcs.c,v 1.1 2011/07/15 07:35:21 jruoho Exp $"); #include #include #include #include #include #include #include #include #define REQUIRE_ERRNO(x, v) \ ATF_REQUIRE_MSG((x) != (v), "%s: %s", #x, strerror(errno)) #define SIZE 256 static struct test { const char *locale; const char *data; wchar_t wchars[64]; int widths[64]; int width; } tests[] = { { "en_US.UTF-8", "[\001\177][\302\200\337\277][\340\240\200\357\277\277][\360\220\200" "\200\364\217\277\277]", { 0x5B, 0x01, 0x7F, 0x5D, 0x5B, 0x80, 0x07FF, 0x5D, 0x5B, 0x0800, 0xFFFF, 0x5D, 0x5B, 0x10000, 0x10FFFF, 0x5D, 0x0A }, #ifdef __FreeBSD__ - { 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, -1, + { 1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, #else { 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, #endif 1, 1, -1, -1, 1, 1, -1, -1, 1, -1 }, -1 }, { "ja_JP.ISO2022-JP", "\033$B#J#I#S$G$9!#\033(Baaaa\033$B$\"$$$&$($*\033(B", { 0x4200234A, 0x42002349, 0x42002353, 0x42002447, 0x42002439, 0x42002123, 0x61, 0x61, 0x61, 0x61, 0x42002422, 0x42002424, 0x42002426, 0x42002428, 0x4200242A, 0x0A }, { 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1 }, 26 }, { "ja_JP.SJIS", "\202r\202i\202h\202r\202\305\202\267\201Baaaa\202\240\202\242" "\202\244\202\246\202\250", { 0x8272, 0x8269, 0x8268, 0x8272, 0x82C5, 0x82B7, 0x8142, 0x61, 0x61, 0x61, 0x61, 0x82A0, 0x82A2, 0x82A4, 0x82A6, 0x82A8, 0x0A }, { 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1 }, 28 }, { "ja_JP.eucJP", "\243\305\243\325\243\303\244\307\244\271\241\243aaaa\244\242\244" "\244\244\246\244\250\244\252", { 0xA3C5, 0xA3D5, 0xA3C3, 0xA4C7, 0xA4B9, 0xA1A3, 0x61, 0x61, 0x61, 0x61, 0xA4A2, 0xA4A4, 0xA4A6, 0xA4A8, 0xA4AA, 0x0A }, { 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1 }, 26 }, { NULL, NULL, {}, {}, 0 } }; ATF_TC(mbstowcs_basic); ATF_TC_HEAD(mbstowcs_basic, tc) { atf_tc_set_md_var(tc, "descr", "Checks wide character functions with different locales"); } ATF_TC_BODY(mbstowcs_basic, tc) { struct test *t; for (t = &tests[0]; t->data != NULL; ++t) { wchar_t wbuf[SIZE]; char buf[SIZE]; char visbuf[SIZE]; char *str; int i; ATF_REQUIRE_STREQ(setlocale(LC_ALL, "C"), "C"); #ifdef __NetBSD__ ATF_REQUIRE(setlocale(LC_CTYPE, t->locale) != NULL); #else if (setlocale(LC_CTYPE, t->locale) == NULL) { fprintf(stderr, "Locale %s not found.\n", t->locale); continue; } #endif (void)strvis(visbuf, t->data, VIS_WHITE | VIS_OCTAL); (void)printf("Checking string: \"%s\"\n", visbuf); ATF_REQUIRE((str = setlocale(LC_ALL, NULL)) != NULL); (void)printf("Using locale: %s\n", str); REQUIRE_ERRNO((ssize_t)mbstowcs(wbuf, t->data, SIZE-1), -1); REQUIRE_ERRNO((ssize_t)wcstombs(buf, wbuf, SIZE-1), -1); if (strcmp(buf, t->data) != 0) { (void)strvis(visbuf, buf, VIS_WHITE | VIS_OCTAL); (void)printf("Conversion to wcs and back failed: " "\"%s\"\n", visbuf); atf_tc_fail("Test failed"); } /* The output here is implementation-dependent. */ for (i = 0; wbuf[i] != 0; ++i) { if (wbuf[i] == t->wchars[i] && wcwidth(wbuf[i]) == t->widths[i]) continue; (void)printf("At position %d:\n", i); (void)printf(" expected: 0x%04X (%d)\n", t->wchars[i], t->widths[i]); (void)printf(" got : 0x%04X (%d)\n", wbuf[i], wcwidth(wbuf[i])); atf_tc_fail("Test failed"); } if (wcswidth(wbuf, SIZE-1) != t->width) { (void)printf("Incorrect wcswidth:\n"); (void)printf(" expected: %d\n", t->width); (void)printf(" got : %d\n", wcswidth(wbuf, SIZE-1)); atf_tc_fail("Test failed"); } (void)printf("Ok.\n"); } } ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, mbstowcs_basic); return atf_no_error(); } Index: user/alc/PQ_LAUNDRY/contrib/netbsd-tests/lib/libc/string/t_memmem.c =================================================================== --- user/alc/PQ_LAUNDRY/contrib/netbsd-tests/lib/libc/string/t_memmem.c (revision 306831) +++ user/alc/PQ_LAUNDRY/contrib/netbsd-tests/lib/libc/string/t_memmem.c (revision 306832) @@ -1,105 +1,108 @@ /* $NetBSD: t_memmem.c,v 1.2 2011/07/07 08:27:36 jruoho Exp $ */ /*- * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Perry E. Metzger of Metzger, Dowdeswell & Co. LLC. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include char p0[] = ""; int lp0 = 0; char p1[] = "0123"; int lp1 = 4; char p2[] = "456"; int lp2 = 3; char p3[] = "789"; int lp3 = 3; char p4[] = "abc"; int lp4 = 3; char p5[] = "0"; int lp5 = 1; char p6[] = "9"; int lp6 = 1; char p7[] = "654"; int lp7 = 3; +char p8[] = "89abc"; +int lp8 = 5; char b0[] = ""; int lb0 = 0; char b1[] = "0"; int lb1 = 1; char b2[] = "0123456789"; int lb2 = 10; #define expect(b) \ if (!(b)) { \ fprintf(stderr, "failed on line %d\n", __LINE__); \ atf_tc_fail("Check stderr for test id/line"); \ } ATF_TC(memmem_basic); ATF_TC_HEAD(memmem_basic, tc) { atf_tc_set_md_var(tc, "descr", "Test memmem results"); } ATF_TC_BODY(memmem_basic, tc) { #if defined(__darwin__) expect(memmem(b2, lb2, p0, lp0) == NULL); expect(memmem(b0, lb0, p0, lp0) == NULL); #else expect(memmem(b2, lb2, p0, lp0) == b2); expect(memmem(b0, lb0, p0, lp0) == b0); #endif expect(memmem(b0, lb0, p1, lp1) == NULL); expect(memmem(b1, lb1, p1, lp1) == NULL); expect(memmem(b2, lb2, p1, lp1) == b2); expect(memmem(b2, lb2, p2, lp2) == (b2 + 4)); expect(memmem(b2, lb2, p3, lp3) == (b2 + 7)); expect(memmem(b2, lb2, p5, lp5) == b2); expect(memmem(b2, lb2, p6, lp6) == (b2 + 9)); expect(memmem(b2, lb2, p4, lp4) == NULL); expect(memmem(b2, lb2, p7, lp7) == NULL); + expect(memmem(b2, lb2, p8, lp8) == NULL); } ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, memmem_basic); return atf_no_error(); } Index: user/alc/PQ_LAUNDRY/contrib/netbsd-tests =================================================================== --- user/alc/PQ_LAUNDRY/contrib/netbsd-tests (revision 306831) +++ user/alc/PQ_LAUNDRY/contrib/netbsd-tests (revision 306832) Property changes on: user/alc/PQ_LAUNDRY/contrib/netbsd-tests ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/netbsd-tests:r306545-306831 Index: user/alc/PQ_LAUNDRY/etc/mtree/BSD.include.dist =================================================================== --- user/alc/PQ_LAUNDRY/etc/mtree/BSD.include.dist (revision 306831) +++ user/alc/PQ_LAUNDRY/etc/mtree/BSD.include.dist (revision 306832) @@ -1,352 +1,354 @@ # $FreeBSD$ # # Please see the file src/etc/mtree/README before making changes to this file. # /set type=dir uname=root gname=wheel mode=0755 . arpa .. atf-c .. atf-c++ .. bsm .. bsnmp .. c++ 4.2 backward .. bits .. debug .. ext pb_ds detail basic_tree_policy .. bin_search_tree_ .. binary_heap_ .. binomial_heap_ .. binomial_heap_base_ .. cc_hash_table_map_ .. eq_fn .. gp_hash_table_map_ .. hash_fn .. left_child_next_sibling_heap_ .. list_update_map_ .. list_update_policy .. ov_tree_map_ .. pairing_heap_ .. pat_trie_ .. rb_tree_map_ .. rc_binomial_heap_ .. resize_policy .. splay_tree_ .. thin_heap_ .. tree_policy .. trie_policy .. unordered_iterator .. .. .. .. tr1 .. .. v1 experimental .. ext .. tr1 .. .. .. cam ata .. nvme .. scsi .. .. casper .. crypto .. dev acpica .. agp .. an .. bktr .. ciss .. evdev .. filemon .. firewire .. hwpmc .. ic .. iicbus .. io .. lmc .. mfi .. mpt mpilib .. .. nand .. nvme .. ofw .. pbio .. pci .. powermac_nvram .. ppbus .. smbus .. speaker .. usb .. utopia .. vkbd .. wi .. .. devdctl .. edit readline .. .. fs cuse .. devfs .. fdescfs .. msdosfs .. nandfs .. nfs .. nullfs .. procfs .. smbfs .. udf .. unionfs .. .. gcc 4.2 .. .. geom cache .. concat .. eli .. gate .. journal .. label .. mirror .. mountver .. multipath .. nop .. raid .. raid3 .. shsec .. stripe .. virstor .. .. gnu posix .. .. gssapi .. infiniband complib .. iba .. opensm .. vendor .. .. isofs cd9660 .. .. kadm5 .. krb5 .. lib80211 .. libmilter .. libxo .. lzma .. machine pc .. .. net altq .. .. net80211 .. netgraph atm .. bluetooth include .. .. netflow .. .. netinet cc .. .. netinet6 .. netipsec .. netnatm api .. msg .. saal .. sig .. .. netpfil pf .. .. netsmb .. nfs .. nfsclient .. nfsserver .. openssl .. pcap .. protocols .. rdma .. rpc .. rpcsvc .. security audit .. mac_biba .. mac_bsdextended .. mac_lomac .. mac_mls .. mac_partition .. .. ssp .. sys + disk + .. .. teken .. ufs ffs .. ufs .. .. vm .. xlocale .. .. Index: user/alc/PQ_LAUNDRY/include/Makefile =================================================================== --- user/alc/PQ_LAUNDRY/include/Makefile (revision 306831) +++ user/alc/PQ_LAUNDRY/include/Makefile (revision 306832) @@ -1,371 +1,372 @@ # @(#)Makefile 8.2 (Berkeley) 1/4/94 # $FreeBSD$ # # Doing a "make install" builds /usr/include. .include PACKAGE=runtime TAGS+= development CLEANFILES= osreldate.h version SUBDIR= arpa protocols rpcsvc rpc xlocale SUBDIR_PARALLEL= INCS= a.out.h ar.h assert.h bitstring.h complex.h cpio.h _ctype.h ctype.h \ db.h \ dirent.h dlfcn.h elf.h elf-hints.h err.h fmtmsg.h fnmatch.h fstab.h \ fts.h ftw.h getopt.h glob.h grp.h \ ieeefp.h ifaddrs.h \ inttypes.h iso646.h kenv.h langinfo.h libgen.h limits.h link.h \ locale.h malloc.h malloc_np.h memory.h monetary.h mpool.h mqueue.h \ ndbm.h netconfig.h \ netdb.h nl_types.h nlist.h nss.h nsswitch.h paths.h \ printf.h proc_service.h pthread.h \ pthread_np.h pwd.h ranlib.h readpassphrase.h regex.h \ res_update.h resolv.h runetype.h search.h semaphore.h setjmp.h \ signal.h spawn.h stab.h stdalign.h stdbool.h stddef.h \ stdnoreturn.h stdio.h stdlib.h string.h stringlist.h \ strings.h sysexits.h tar.h termios.h tgmath.h \ time.h timeconv.h timers.h ttyent.h \ uchar.h ulimit.h unistd.h utime.h utmpx.h uuid.h varargs.h \ wchar.h wctype.h wordexp.h xlocale.h .PATH: ${.CURDIR}/../contrib/libc-vis INCS+= vis.h MHDRS= float.h floatingpoint.h stdarg.h PHDRS= sched.h _semaphore.h LHDRS= aio.h errno.h fcntl.h linker_set.h poll.h stdatomic.h stdint.h \ syslog.h ucontext.h LDIRS= bsm cam geom net net80211 netgraph netinet netinet6 \ netipsec netnatm netsmb nfs nfsclient nfsserver sys vm LSUBDIRS= cam/ata cam/nvme cam/scsi \ dev/acpica dev/agp dev/an dev/bktr dev/ciss dev/filemon dev/firewire \ dev/hwpmc \ dev/ic dev/iicbus dev/io dev/lmc dev/mfi dev/nvme \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/smbus \ dev/speaker dev/utopia dev/vkbd dev/wi \ fs/devfs fs/fdescfs fs/msdosfs fs/nandfs fs/nfs fs/nullfs \ fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ geom/mirror geom/mountver geom/multipath geom/nop \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ net/altq \ netgraph/atm netgraph/netflow \ netinet/cc \ security/audit \ security/mac_biba security/mac_bsdextended security/mac_lomac \ security/mac_mls security/mac_partition \ + sys/disk \ ufs/ffs ufs/ufs LSUBSUBDIRS= dev/mpt/mpilib .if ${MK_BLUETOOTH} != "no" LSUBSUBDIRS+= netgraph/bluetooth/include .endif .if ${MK_CUSE} != "no" LSUBDIRS+= fs/cuse .endif .if ${MK_GSSAPI} != "no" SUBDIR+= gssapi INCS+= gssapi.h .endif .if ${MK_HESIOD} != "no" INCS+= hesiod.h .endif # Handle the #define aliases for libiconv .if ${MK_ICONV} == "yes" INCS+= iconv.h .endif .if ${MK_USB} != "no" LSUBDIRS+= dev/usb .endif .if ${MACHINE_ARCH} == "powerpc" || ${MACHINE_ARCH} == "powerpc64" _dev_powermac_nvram= dev/powermac_nvram .endif # Define SHARED to indicate whether you want symbolic links to the system # source (``symlinks''), or a separate copy (``copies''). ``symlinks'' is # probably only useful for developers and should be avoided if you do not # wish to tie your /usr/include and /usr/src together. #SHARED= symlinks SHARED?= copies INCS+= osreldate.h SYSDIR= ${.CURDIR}/../sys NEWVERS_SH= ${SYSDIR}/conf/newvers.sh PARAM_H= ${SYSDIR}/sys/param.h MK_OSRELDATE_SH= ${.CURDIR}/mk-osreldate.sh SYMLINKS+= ${INCLUDEDIR} ${LIBDIR}/include osreldate.h: ${NEWVERS_SH} ${PARAM_H} ${MK_OSRELDATE_SH} env NEWVERS_SH=${NEWVERS_SH} PARAMFILE=${PARAM_H} SYSDIR=${SYSDIR} \ sh ${MK_OSRELDATE_SH} .for i in ${LHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .for i in ${MHDRS} INCSLINKS+= machine/$i ${INCLUDEDIR}/$i .endfor .for i in ${PHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .if ${MACHINE} != ${MACHINE_CPUARCH} _MARCHS= ${MACHINE_CPUARCH} .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _MARCHS+= x86 .endif META_TARGETS+= compat stage_includes: ${SHARED} # Take care of stale directory-level symlinks. compat: .for i in ${LDIRS} ${LSUBDIRS} machine ${_MARCHS} crypto if [ -L ${DESTDIR}${INCLUDEDIR}/$i ]; then \ rm -f ${DESTDIR}${INCLUDEDIR}/$i; \ fi .endfor mtree -deU ${MTREE_FOLLOWS_SYMLINKS} \ -f ${.CURDIR}/../etc/mtree/BSD.include.dist \ -p ${DESTDIR}${INCLUDEDIR} > /dev/null copies: .PHONY .META .for i in ${LDIRS} ${LSUBDIRS} ${LSUBSUBDIRS} crypto machine machine/pc \ ${_MARCHS} if [ -d ${DESTDIR}${INCLUDEDIR}/$i ]; then \ cd ${DESTDIR}${INCLUDEDIR}/$i; \ for h in *.h; do \ if [ -L $$h ]; then rm -f $$h; fi; \ done; \ fi .endfor .for i in ${LDIRS} ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/bktr:Ndev/evdev:Ndev/nand:Ndev/pci} ${LSUBSUBDIRS} cd ${.CURDIR}/../sys; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 $i/*.h \ ${DESTDIR}${INCLUDEDIR}/$i .endfor cd ${.CURDIR}/../sys/dev/acpica; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 acpiio.h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 acpi_hpet.h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica cd ${.CURDIR}/../sys/dev/agp; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 agpreg.h \ ${DESTDIR}${INCLUDEDIR}/dev/agp cd ${.CURDIR}/../sys/dev/bktr; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 ioctl_*.h \ ${DESTDIR}${INCLUDEDIR}/dev/bktr .if ${MK_NAND} != "no" cd ${.CURDIR}/../sys/dev/nand; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 nandsim.h \ ${DESTDIR}${INCLUDEDIR}/dev/nand; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 nand_dev.h \ ${DESTDIR}${INCLUDEDIR}/dev/nand .endif cd ${.CURDIR}/../sys/dev/evdev; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 input.h \ ${DESTDIR}${INCLUDEDIR}/dev/evdev; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 input-event-codes.h \ ${DESTDIR}${INCLUDEDIR}/dev/evdev; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 uinput.h \ ${DESTDIR}${INCLUDEDIR}/dev/evdev cd ${.CURDIR}/../sys/dev/pci; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 pcireg.h \ ${DESTDIR}${INCLUDEDIR}/dev/pci cd ${.CURDIR}/../sys/fs/cd9660/; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/isofs/cd9660 .if ${MK_IPFILTER} != "no" cd ${.CURDIR}/../sys/contrib/ipfilter/netinet; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/netinet .endif .if ${MK_PF} != "no" cd ${.CURDIR}/../sys/netpfil/pf; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/netpfil/pf .endif cd ${.CURDIR}/../sys/crypto; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 rijndael/rijndael.h \ ${DESTDIR}${INCLUDEDIR}/crypto cd ${.CURDIR}/../sys/opencrypto; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/crypto cd ${.CURDIR}/../sys/${MACHINE}/include; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/machine .if exists(${.CURDIR}/../sys/${MACHINE}/include/pc) cd ${.CURDIR}/../sys/${MACHINE}/include/pc; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/machine/pc .endif .for _MARCH in ${_MARCHS} .if exists(${.CURDIR}/../sys/${_MARCH}/include) ${INSTALL} -d ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${.CURDIR}/../sys/${_MARCH}/include; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH} .if exists(${.CURDIR}/../sys/${_MARCH}/include/pc) ${INSTALL} -d ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${.CURDIR}/../sys/${_MARCH}/include/pc; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc .endif .endif .endfor cd ${.CURDIR}/../sys/rpc; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 types.h \ ${DESTDIR}${INCLUDEDIR}/rpc cd ${.CURDIR}/../sys/teken; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 teken.h \ ${DESTDIR}${INCLUDEDIR}/teken symlinks: .PHONY .META @${ECHO} "Setting up symlinks to kernel source tree..." .for i in ${LDIRS} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor .for i in ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/bktr:Ndev/evdev:Ndev/nand:Ndev/pci} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor cd ${.CURDIR}/../sys/dev/acpica; \ for h in acpiio.h acpi_hpet.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/dev/acpica/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica; \ done cd ${.CURDIR}/../sys/dev/agp; \ for h in agpreg.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/dev/agp/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/agp; \ done cd ${.CURDIR}/../sys/dev/bktr; \ for h in ioctl_*.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/dev/bktr/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/bktr; \ done .if ${MK_NAND} != "no" cd ${.CURDIR}/../sys/dev/nand; \ for h in nandsim.h nand_dev.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/dev/nand/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/nand; \ done .endif cd ${.CURDIR}/../sys/dev/evdev; \ for h in input.h input-event-codes.h uinput.h; do \ ln -fs ../../../../sys/dev/evdev/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/evdev; \ done cd ${.CURDIR}/../sys/dev/pci; \ for h in pcireg.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/dev/pci/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/pci; \ done .for i in ${LSUBSUBDIRS} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor .if ${MK_IPFILTER} != "no" cd ${.CURDIR}/../sys/contrib/ipfilter/netinet; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../sys/contrib/ipfilter/netinet/$$h \ ${DESTDIR}${INCLUDEDIR}/netinet; \ done .endif .if ${MK_PF} != "no" cd ${.CURDIR}/../sys/netpfil/pf; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/netpfil/pf/$$h \ ${DESTDIR}${INCLUDEDIR}/netpfil/pf; \ done .endif cd ${.CURDIR}/../sys/crypto; \ for h in rijndael/rijndael.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../sys/crypto/$$h \ ${DESTDIR}${INCLUDEDIR}/crypto; \ done cd ${.CURDIR}/../sys/opencrypto; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../sys/opencrypto/$$h \ ${DESTDIR}${INCLUDEDIR}/crypto; \ done cd ${.CURDIR}/../sys/${MACHINE}/include; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../sys/${MACHINE}/include/$$h \ ${DESTDIR}${INCLUDEDIR}/machine; \ done .if exists(${.CURDIR}/../sys/${MACHINE}/include/pc) cd ${.CURDIR}/../sys/${MACHINE}/include/pc; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/${MACHINE}/include/pc/$$h \ ${DESTDIR}${INCLUDEDIR}/machine/pc; \ done .endif .for _MARCH in ${_MARCHS} .if exists(${.CURDIR}/../sys/${_MARCH}/include) ${INSTALL} -d ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${.CURDIR}/../sys/${_MARCH}/include; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../sys/${_MARCH}/include/$$h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ done .if exists(${.CURDIR}/../sys/${_MARCH}/include/pc) ${INSTALL} -d ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${.CURDIR}/../sys/${_MARCH}/include/pc; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/${_MARCH}/include/pc/$$h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ done .endif .endif .endfor cd ${.CURDIR}/../sys/fs/cd9660; \ for h in *.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../../sys/fs/cd9660/$$h \ ${DESTDIR}${INCLUDEDIR}/isofs/cd9660; \ done cd ${.CURDIR}/../sys/rpc; \ for h in types.h; do \ ${INSTALL_SYMLINK} ${TAG_ARGS} ../../../sys/rpc/$$h \ ${DESTDIR}${INCLUDEDIR}/rpc; \ done .include installincludes: ${SHARED} ${SHARED}: compat .if ${MACHINE} == "host" && !defined(_SKIP_BUILD) # we're here because we are building a sysroot... # we need MACHINE et al set correctly HOST_MACHINE!= uname -m HOST_MACHINE_ARCH!= uname -p MACHINE:= ${HOST_MACHINE} MACHINE_ARCH:= ${HOST_MACHINE_ARCH} .endif Index: user/alc/PQ_LAUNDRY/sbin/init/init.c =================================================================== --- user/alc/PQ_LAUNDRY/sbin/init/init.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sbin/init/init.c (revision 306832) @@ -1,1989 +1,2002 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Donn Seeley at Berkeley Software Design, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1991, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)init.c 8.1 (Berkeley) 7/15/93"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SECURE #include #endif #ifdef LOGIN_CAP #include #endif #include "mntopts.h" #include "pathnames.h" /* * Sleep times; used to prevent thrashing. */ #define GETTY_SPACING 5 /* N secs minimum getty spacing */ #define GETTY_SLEEP 30 /* sleep N secs after spacing problem */ #define GETTY_NSPACE 3 /* max. spacing count to bring reaction */ #define WINDOW_WAIT 3 /* wait N secs after starting window */ #define STALL_TIMEOUT 30 /* wait N secs after warning */ #define DEATH_WATCH 10 /* wait N secs for procs to die */ #define DEATH_SCRIPT 120 /* wait for 2min for /etc/rc.shutdown */ #define RESOURCE_RC "daemon" #define RESOURCE_WINDOW "default" #define RESOURCE_GETTY "default" static void handle(sig_t, ...); static void delset(sigset_t *, ...); static void stall(const char *, ...) __printflike(1, 2); static void warning(const char *, ...) __printflike(1, 2); static void emergency(const char *, ...) __printflike(1, 2); static void disaster(int); static void badsys(int); static void revoke_ttys(void); static int runshutdown(void); static char *strk(char *); /* * We really need a recursive typedef... * The following at least guarantees that the return type of (*state_t)() * is sufficiently wide to hold a function pointer. */ typedef long (*state_func_t)(void); typedef state_func_t (*state_t)(void); static state_func_t single_user(void); static state_func_t runcom(void); static state_func_t read_ttys(void); static state_func_t multi_user(void); static state_func_t clean_ttys(void); static state_func_t catatonia(void); static state_func_t death(void); static state_func_t death_single(void); static state_func_t reroot(void); static state_func_t reroot_phase_two(void); static state_func_t run_script(const char *); static enum { AUTOBOOT, FASTBOOT } runcom_mode = AUTOBOOT; #define FALSE 0 #define TRUE 1 static int Reboot = FALSE; static int howto = RB_AUTOBOOT; static int devfs; static char *init_path_argv0; static void transition(state_t); static state_t requested_transition; static state_t current_state = death_single; static void open_console(void); static const char *get_shell(void); static void write_stderr(const char *message); typedef struct init_session { pid_t se_process; /* controlling process */ time_t se_started; /* used to avoid thrashing */ int se_flags; /* status of session */ #define SE_SHUTDOWN 0x1 /* session won't be restarted */ #define SE_PRESENT 0x2 /* session is in /etc/ttys */ int se_nspace; /* spacing count */ char *se_device; /* filename of port */ char *se_getty; /* what to run on that port */ char *se_getty_argv_space; /* pre-parsed argument array space */ char **se_getty_argv; /* pre-parsed argument array */ char *se_window; /* window system (started only once) */ char *se_window_argv_space; /* pre-parsed argument array space */ char **se_window_argv; /* pre-parsed argument array */ char *se_type; /* default terminal type */ struct init_session *se_prev; struct init_session *se_next; } session_t; static void free_session(session_t *); static session_t *new_session(session_t *, struct ttyent *); static session_t *sessions; static char **construct_argv(char *); static void start_window_system(session_t *); static void collect_child(pid_t); static pid_t start_getty(session_t *); static void transition_handler(int); static void alrm_handler(int); static void setsecuritylevel(int); static int getsecuritylevel(void); static int setupargv(session_t *, struct ttyent *); #ifdef LOGIN_CAP static void setprocresources(const char *); #endif static int clang; static int start_session_db(void); static void add_session(session_t *); static void del_session(session_t *); static session_t *find_session(pid_t); static DB *session_db; /* * The mother of all processes. */ int main(int argc, char *argv[]) { state_t initial_transition = runcom; char kenv_value[PATH_MAX]; int c, error; struct sigaction sa; sigset_t mask; /* Dispose of random users. */ if (getuid() != 0) errx(1, "%s", strerror(EPERM)); /* System V users like to reexec init. */ if (getpid() != 1) { #ifdef COMPAT_SYSV_INIT /* So give them what they want */ if (argc > 1) { if (strlen(argv[1]) == 1) { char runlevel = *argv[1]; int sig; switch (runlevel) { case '0': /* halt + poweroff */ sig = SIGUSR2; break; case '1': /* single-user */ sig = SIGTERM; break; case '6': /* reboot */ sig = SIGINT; break; case 'c': /* block further logins */ sig = SIGTSTP; break; case 'q': /* rescan /etc/ttys */ sig = SIGHUP; break; case 'r': /* remount root */ sig = SIGEMT; break; default: goto invalid; } kill(1, sig); _exit(0); } else invalid: errx(1, "invalid run-level ``%s''", argv[1]); } else #endif errx(1, "already running"); } init_path_argv0 = strdup(argv[0]); if (init_path_argv0 == NULL) err(1, "strdup"); /* * Note that this does NOT open a file... * Does 'init' deserve its own facility number? */ openlog("init", LOG_CONS, LOG_AUTH); /* * Create an initial session. */ if (setsid() < 0 && (errno != EPERM || getsid(0) != 1)) warning("initial setsid() failed: %m"); /* * Establish an initial user so that programs running * single user do not freak out and die (like passwd). */ if (setlogin("root") < 0) warning("setlogin() failed: %m"); /* * This code assumes that we always get arguments through flags, * never through bits set in some random machine register. */ while ((c = getopt(argc, argv, "dsfr")) != -1) switch (c) { case 'd': devfs = 1; break; case 's': initial_transition = single_user; break; case 'f': runcom_mode = FASTBOOT; break; case 'r': initial_transition = reroot_phase_two; break; default: warning("unrecognized flag '-%c'", c); break; } if (optind != argc) warning("ignoring excess arguments"); /* * We catch or block signals rather than ignore them, * so that they get reset on exec. */ handle(badsys, SIGSYS, 0); handle(disaster, SIGABRT, SIGFPE, SIGILL, SIGSEGV, SIGBUS, SIGXCPU, SIGXFSZ, 0); handle(transition_handler, SIGHUP, SIGINT, SIGEMT, SIGTERM, SIGTSTP, SIGUSR1, SIGUSR2, 0); handle(alrm_handler, SIGALRM, 0); sigfillset(&mask); delset(&mask, SIGABRT, SIGFPE, SIGILL, SIGSEGV, SIGBUS, SIGSYS, SIGXCPU, SIGXFSZ, SIGHUP, SIGINT, SIGEMT, SIGTERM, SIGTSTP, SIGALRM, SIGUSR1, SIGUSR2, 0); sigprocmask(SIG_SETMASK, &mask, (sigset_t *) 0); sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sa.sa_handler = SIG_IGN; sigaction(SIGTTIN, &sa, (struct sigaction *)0); sigaction(SIGTTOU, &sa, (struct sigaction *)0); /* * Paranoia. */ close(0); close(1); close(2); if (kenv(KENV_GET, "init_script", kenv_value, sizeof(kenv_value)) > 0) { state_func_t next_transition; if ((next_transition = run_script(kenv_value)) != NULL) initial_transition = (state_t) next_transition; } if (kenv(KENV_GET, "init_chroot", kenv_value, sizeof(kenv_value)) > 0) { if (chdir(kenv_value) != 0 || chroot(".") != 0) warning("Can't chroot to %s: %m", kenv_value); } /* * Additional check if devfs needs to be mounted: * If "/" and "/dev" have the same device number, * then it hasn't been mounted yet. */ if (!devfs) { struct stat stst; dev_t root_devno; stat("/", &stst); root_devno = stst.st_dev; if (stat("/dev", &stst) != 0) warning("Can't stat /dev: %m"); else if (stst.st_dev == root_devno) devfs++; } if (devfs) { struct iovec iov[4]; char *s; int i; char _fstype[] = "fstype"; char _devfs[] = "devfs"; char _fspath[] = "fspath"; char _path_dev[]= _PATH_DEV; iov[0].iov_base = _fstype; iov[0].iov_len = sizeof(_fstype); iov[1].iov_base = _devfs; iov[1].iov_len = sizeof(_devfs); iov[2].iov_base = _fspath; iov[2].iov_len = sizeof(_fspath); /* * Try to avoid the trailing slash in _PATH_DEV. * Be *very* defensive. */ s = strdup(_PATH_DEV); if (s != NULL) { i = strlen(s); if (i > 0 && s[i - 1] == '/') s[i - 1] = '\0'; iov[3].iov_base = s; iov[3].iov_len = strlen(s) + 1; } else { iov[3].iov_base = _path_dev; iov[3].iov_len = sizeof(_path_dev); } nmount(iov, 4, 0); if (s != NULL) free(s); } if (initial_transition != reroot_phase_two) { /* * Unmount reroot leftovers. This runs after init(8) * gets reexecuted after reroot_phase_two() is done. */ error = unmount(_PATH_REROOT, MNT_FORCE); if (error != 0 && errno != EINVAL) warning("Cannot unmount %s: %m", _PATH_REROOT); } /* * Start the state machine. */ transition(initial_transition); /* * Should never reach here. */ return 1; } /* * Associate a function with a signal handler. */ static void handle(sig_t handler, ...) { int sig; struct sigaction sa; sigset_t mask_everything; va_list ap; va_start(ap, handler); sa.sa_handler = handler; sigfillset(&mask_everything); while ((sig = va_arg(ap, int)) != 0) { sa.sa_mask = mask_everything; /* XXX SA_RESTART? */ sa.sa_flags = sig == SIGCHLD ? SA_NOCLDSTOP : 0; sigaction(sig, &sa, (struct sigaction *) 0); } va_end(ap); } /* * Delete a set of signals from a mask. */ static void delset(sigset_t *maskp, ...) { int sig; va_list ap; va_start(ap, maskp); while ((sig = va_arg(ap, int)) != 0) sigdelset(maskp, sig); va_end(ap); } /* * Log a message and sleep for a while (to give someone an opportunity * to read it and to save log or hardcopy output if the problem is chronic). * NB: should send a message to the session logger to avoid blocking. */ static void stall(const char *message, ...) { va_list ap; va_start(ap, message); vsyslog(LOG_ALERT, message, ap); va_end(ap); sleep(STALL_TIMEOUT); } /* * Like stall(), but doesn't sleep. * If cpp had variadic macros, the two functions could be #defines for another. * NB: should send a message to the session logger to avoid blocking. */ static void warning(const char *message, ...) { va_list ap; va_start(ap, message); vsyslog(LOG_ALERT, message, ap); va_end(ap); } /* * Log an emergency message. * NB: should send a message to the session logger to avoid blocking. */ static void emergency(const char *message, ...) { va_list ap; va_start(ap, message); vsyslog(LOG_EMERG, message, ap); va_end(ap); } /* * Catch a SIGSYS signal. * * These may arise if a system does not support sysctl. * We tolerate up to 25 of these, then throw in the towel. */ static void badsys(int sig) { static int badcount = 0; if (badcount++ < 25) return; disaster(sig); } /* * Catch an unexpected signal. */ static void disaster(int sig) { emergency("fatal signal: %s", (unsigned)sig < NSIG ? sys_siglist[sig] : "unknown signal"); sleep(STALL_TIMEOUT); _exit(sig); /* reboot */ } /* * Get the security level of the kernel. */ static int getsecuritylevel(void) { #ifdef KERN_SECURELVL int name[2], curlevel; size_t len; name[0] = CTL_KERN; name[1] = KERN_SECURELVL; len = sizeof curlevel; if (sysctl(name, 2, &curlevel, &len, NULL, 0) == -1) { emergency("cannot get kernel security level: %s", strerror(errno)); return (-1); } return (curlevel); #else return (-1); #endif } /* * Set the security level of the kernel. */ static void setsecuritylevel(int newlevel) { #ifdef KERN_SECURELVL int name[2], curlevel; curlevel = getsecuritylevel(); if (newlevel == curlevel) return; name[0] = CTL_KERN; name[1] = KERN_SECURELVL; if (sysctl(name, 2, NULL, NULL, &newlevel, sizeof newlevel) == -1) { emergency( "cannot change kernel security level from %d to %d: %s", curlevel, newlevel, strerror(errno)); return; } #ifdef SECURE warning("kernel security level changed from %d to %d", curlevel, newlevel); #endif #endif } /* * Change states in the finite state machine. * The initial state is passed as an argument. */ static void transition(state_t s) { current_state = s; for (;;) current_state = (state_t) (*current_state)(); } /* * Start a session and allocate a controlling terminal. * Only called by children of init after forking. */ static void open_console(void) { int fd; /* * Try to open /dev/console. Open the device with O_NONBLOCK to * prevent potential blocking on a carrier. */ revoke(_PATH_CONSOLE); if ((fd = open(_PATH_CONSOLE, O_RDWR | O_NONBLOCK)) != -1) { (void)fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) & ~O_NONBLOCK); if (login_tty(fd) == 0) return; close(fd); } /* No luck. Log output to file if possible. */ if ((fd = open(_PATH_DEVNULL, O_RDWR)) == -1) { stall("cannot open null device."); _exit(1); } if (fd != STDIN_FILENO) { dup2(fd, STDIN_FILENO); close(fd); } fd = open(_PATH_INITLOG, O_WRONLY | O_APPEND | O_CREAT, 0644); if (fd == -1) dup2(STDIN_FILENO, STDOUT_FILENO); else if (fd != STDOUT_FILENO) { dup2(fd, STDOUT_FILENO); close(fd); } dup2(STDOUT_FILENO, STDERR_FILENO); } static const char * get_shell(void) { static char kenv_value[PATH_MAX]; if (kenv(KENV_GET, "init_shell", kenv_value, sizeof(kenv_value)) > 0) return kenv_value; else return _PATH_BSHELL; } static void write_stderr(const char *message) { write(STDERR_FILENO, message, strlen(message)); } static int read_file(const char *path, void **bufp, size_t *bufsizep) { struct stat sb; size_t bufsize; void *buf; ssize_t nbytes; int error, fd; fd = open(path, O_RDONLY); if (fd < 0) { emergency("%s: %s", path, strerror(errno)); return (-1); } error = fstat(fd, &sb); if (error != 0) { emergency("fstat: %s", strerror(errno)); close(fd); return (error); } bufsize = sb.st_size; buf = malloc(bufsize); if (buf == NULL) { emergency("malloc: %s", strerror(errno)); close(fd); return (error); } nbytes = read(fd, buf, bufsize); if (nbytes != (ssize_t)bufsize) { emergency("read: %s", strerror(errno)); close(fd); free(buf); return (error); } error = close(fd); if (error != 0) { emergency("close: %s", strerror(errno)); free(buf); return (error); } *bufp = buf; *bufsizep = bufsize; return (0); } static int create_file(const char *path, const void *buf, size_t bufsize) { ssize_t nbytes; int error, fd; fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0700); if (fd < 0) { emergency("%s: %s", path, strerror(errno)); return (-1); } nbytes = write(fd, buf, bufsize); if (nbytes != (ssize_t)bufsize) { emergency("write: %s", strerror(errno)); close(fd); return (-1); } error = close(fd); if (error != 0) { emergency("close: %s", strerror(errno)); return (-1); } return (0); } static int mount_tmpfs(const char *fspath) { struct iovec *iov; char errmsg[255]; int error, iovlen; iov = NULL; iovlen = 0; memset(errmsg, 0, sizeof(errmsg)); build_iovec(&iov, &iovlen, "fstype", __DECONST(void *, "tmpfs"), (size_t)-1); build_iovec(&iov, &iovlen, "fspath", __DECONST(void *, fspath), (size_t)-1); build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); error = nmount(iov, iovlen, 0); if (error != 0) { if (*errmsg != '\0') { emergency("cannot mount tmpfs on %s: %s: %s", fspath, errmsg, strerror(errno)); } else { emergency("cannot mount tmpfs on %s: %s", fspath, strerror(errno)); } return (error); } return (0); } static state_func_t reroot(void) { void *buf; size_t bufsize; int error; buf = NULL; bufsize = 0; revoke_ttys(); runshutdown(); /* * Make sure nobody can interfere with our scheme. * Ignore ESRCH, which can apparently happen when * there are no processes to kill. */ error = kill(-1, SIGKILL); if (error != 0 && errno != ESRCH) { emergency("kill(2) failed: %s", strerror(errno)); goto out; } /* * Copy the init binary into tmpfs, so that we can unmount * the old rootfs without committing suicide. */ error = read_file(init_path_argv0, &buf, &bufsize); if (error != 0) goto out; error = mount_tmpfs(_PATH_REROOT); if (error != 0) goto out; error = create_file(_PATH_REROOT_INIT, buf, bufsize); if (error != 0) goto out; /* * Execute the temporary init. */ execl(_PATH_REROOT_INIT, _PATH_REROOT_INIT, "-r", NULL); emergency("cannot exec %s: %s", _PATH_REROOT_INIT, strerror(errno)); out: emergency("reroot failed; going to single user mode"); free(buf); return (state_func_t) single_user; } static state_func_t reroot_phase_two(void) { char init_path[PATH_MAX], *path, *path_component; size_t init_path_len; int nbytes, error; /* * Ask the kernel to mount the new rootfs. */ error = reboot(RB_REROOT); if (error != 0) { emergency("RB_REBOOT failed: %s", strerror(errno)); goto out; } /* * Figure out where the destination init(8) binary is. Note that * the path could be different than what we've started with. Use * the value from kenv, if set, or the one from sysctl otherwise. * The latter defaults to a hardcoded value, but can be overridden * by a build time option. */ nbytes = kenv(KENV_GET, "init_path", init_path, sizeof(init_path)); if (nbytes <= 0) { init_path_len = sizeof(init_path); error = sysctlbyname("kern.init_path", init_path, &init_path_len, NULL, 0); if (error != 0) { emergency("failed to retrieve kern.init_path: %s", strerror(errno)); goto out; } } /* * Repeat the init search logic from sys/kern/init_path.c */ path_component = init_path; while ((path = strsep(&path_component, ":")) != NULL) { /* * Execute init(8) from the new rootfs. */ execl(path, path, NULL); } emergency("cannot exec init from %s: %s", init_path, strerror(errno)); out: emergency("reroot failed; going to single user mode"); return (state_func_t) single_user; } /* * Bring the system up single user. */ static state_func_t single_user(void) { pid_t pid, wpid; int status; sigset_t mask; const char *shell; char *argv[2]; + struct timeval tv, tn; #ifdef SECURE struct ttyent *typ; struct passwd *pp; static const char banner[] = "Enter root password, or ^D to go multi-user\n"; char *clear, *password; #endif #ifdef DEBUGSHELL char altshell[128]; #endif if (Reboot) { /* Instead of going single user, let's reboot the machine */ sync(); - reboot(howto); - _exit(0); + if (reboot(howto) == -1) { + emergency("reboot(%#x) failed, %s", howto, + strerror(errno)); + _exit(1); /* panic and reboot */ + } + warning("reboot(%#x) returned", howto); + _exit(0); /* panic as well */ } shell = get_shell(); if ((pid = fork()) == 0) { /* * Start the single user session. */ open_console(); #ifdef SECURE /* * Check the root password. * We don't care if the console is 'on' by default; * it's the only tty that can be 'off' and 'secure'. */ typ = getttynam("console"); pp = getpwnam("root"); if (typ && (typ->ty_status & TTY_SECURE) == 0 && pp && *pp->pw_passwd) { write_stderr(banner); for (;;) { clear = getpass("Password:"); if (clear == NULL || *clear == '\0') _exit(0); password = crypt(clear, pp->pw_passwd); bzero(clear, _PASSWORD_LEN); if (password == NULL || strcmp(password, pp->pw_passwd) == 0) break; warning("single-user login failed\n"); } } endttyent(); endpwent(); #endif /* SECURE */ #ifdef DEBUGSHELL { char *cp = altshell; int num; #define SHREQUEST "Enter full pathname of shell or RETURN for " write_stderr(SHREQUEST); write_stderr(shell); write_stderr(": "); while ((num = read(STDIN_FILENO, cp, 1)) != -1 && num != 0 && *cp != '\n' && cp < &altshell[127]) cp++; *cp = '\0'; if (altshell[0] != '\0') shell = altshell; } #endif /* DEBUGSHELL */ /* * Unblock signals. * We catch all the interesting ones, * and those are reset to SIG_DFL on exec. */ sigemptyset(&mask); sigprocmask(SIG_SETMASK, &mask, (sigset_t *) 0); /* * Fire off a shell. * If the default one doesn't work, try the Bourne shell. */ char name[] = "-sh"; argv[0] = name; argv[1] = 0; execv(shell, argv); emergency("can't exec %s for single user: %m", shell); execv(_PATH_BSHELL, argv); emergency("can't exec %s for single user: %m", _PATH_BSHELL); sleep(STALL_TIMEOUT); _exit(1); } if (pid == -1) { /* * We are seriously hosed. Do our best. */ emergency("can't fork single-user shell, trying again"); while (waitpid(-1, (int *) 0, WNOHANG) > 0) continue; return (state_func_t) single_user; } requested_transition = 0; do { if ((wpid = waitpid(-1, &status, WUNTRACED)) != -1) collect_child(wpid); if (wpid == -1) { if (errno == EINTR) continue; warning("wait for single-user shell failed: %m; restarting"); return (state_func_t) single_user; } if (wpid == pid && WIFSTOPPED(status)) { warning("init: shell stopped, restarting\n"); kill(pid, SIGCONT); wpid = -1; } } while (wpid != pid && !requested_transition); if (requested_transition) return (state_func_t) requested_transition; if (!WIFEXITED(status)) { if (WTERMSIG(status) == SIGKILL) { /* * reboot(8) killed shell? */ warning("single user shell terminated."); - sleep(STALL_TIMEOUT); + gettimeofday(&tv, NULL); + tn = tv; + tv.tv_sec += STALL_TIMEOUT; + while (tv.tv_sec > tn.tv_sec || (tv.tv_sec == + tn.tv_sec && tv.tv_usec > tn.tv_usec)) { + sleep(1); + gettimeofday(&tn, NULL); + } _exit(0); } else { warning("single user shell terminated, restarting"); return (state_func_t) single_user; } } runcom_mode = FASTBOOT; return (state_func_t) runcom; } /* * Run the system startup script. */ static state_func_t runcom(void) { state_func_t next_transition; if ((next_transition = run_script(_PATH_RUNCOM)) != NULL) return next_transition; runcom_mode = AUTOBOOT; /* the default */ return (state_func_t) read_ttys; } /* * Run a shell script. * Returns 0 on success, otherwise the next transition to enter: * - single_user if fork/execv/waitpid failed, or if the script * terminated with a signal or exit code != 0. * - death_single if a SIGTERM was delivered to init(8). */ static state_func_t run_script(const char *script) { pid_t pid, wpid; int status; char *argv[4]; const char *shell; struct sigaction sa; shell = get_shell(); if ((pid = fork()) == 0) { sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sa.sa_handler = SIG_IGN; sigaction(SIGTSTP, &sa, (struct sigaction *)0); sigaction(SIGHUP, &sa, (struct sigaction *)0); open_console(); char _sh[] = "sh"; char _autoboot[] = "autoboot"; argv[0] = _sh; argv[1] = __DECONST(char *, script); argv[2] = runcom_mode == AUTOBOOT ? _autoboot : 0; argv[3] = 0; sigprocmask(SIG_SETMASK, &sa.sa_mask, (sigset_t *) 0); #ifdef LOGIN_CAP setprocresources(RESOURCE_RC); #endif execv(shell, argv); stall("can't exec %s for %s: %m", shell, script); _exit(1); /* force single user mode */ } if (pid == -1) { emergency("can't fork for %s on %s: %m", shell, script); while (waitpid(-1, (int *) 0, WNOHANG) > 0) continue; sleep(STALL_TIMEOUT); return (state_func_t) single_user; } /* * Copied from single_user(). This is a bit paranoid. */ requested_transition = 0; do { if ((wpid = waitpid(-1, &status, WUNTRACED)) != -1) collect_child(wpid); if (wpid == -1) { if (requested_transition == death_single || requested_transition == reroot) return (state_func_t) requested_transition; if (errno == EINTR) continue; warning("wait for %s on %s failed: %m; going to " "single user mode", shell, script); return (state_func_t) single_user; } if (wpid == pid && WIFSTOPPED(status)) { warning("init: %s on %s stopped, restarting\n", shell, script); kill(pid, SIGCONT); wpid = -1; } } while (wpid != pid); if (WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM && requested_transition == catatonia) { /* /etc/rc executed /sbin/reboot; wait for the end quietly */ sigset_t s; sigfillset(&s); for (;;) sigsuspend(&s); } if (!WIFEXITED(status)) { warning("%s on %s terminated abnormally, going to single " "user mode", shell, script); return (state_func_t) single_user; } if (WEXITSTATUS(status)) return (state_func_t) single_user; return (state_func_t) 0; } /* * Open the session database. * * NB: We could pass in the size here; is it necessary? */ static int start_session_db(void) { if (session_db && (*session_db->close)(session_db)) emergency("session database close: %s", strerror(errno)); if ((session_db = dbopen(NULL, O_RDWR, 0, DB_HASH, NULL)) == NULL) { emergency("session database open: %s", strerror(errno)); return (1); } return (0); } /* * Add a new login session. */ static void add_session(session_t *sp) { DBT key; DBT data; key.data = &sp->se_process; key.size = sizeof sp->se_process; data.data = &sp; data.size = sizeof sp; if ((*session_db->put)(session_db, &key, &data, 0)) emergency("insert %d: %s", sp->se_process, strerror(errno)); } /* * Delete an old login session. */ static void del_session(session_t *sp) { DBT key; key.data = &sp->se_process; key.size = sizeof sp->se_process; if ((*session_db->del)(session_db, &key, 0)) emergency("delete %d: %s", sp->se_process, strerror(errno)); } /* * Look up a login session by pid. */ static session_t * find_session(pid_t pid) { DBT key; DBT data; session_t *ret; key.data = &pid; key.size = sizeof pid; if ((*session_db->get)(session_db, &key, &data, 0) != 0) return 0; bcopy(data.data, (char *)&ret, sizeof(ret)); return ret; } /* * Construct an argument vector from a command line. */ static char ** construct_argv(char *command) { int argc = 0; char **argv = (char **) malloc(((strlen(command) + 1) / 2 + 1) * sizeof (char *)); if ((argv[argc++] = strk(command)) == NULL) { free(argv); return (NULL); } while ((argv[argc++] = strk((char *) 0)) != NULL) continue; return argv; } /* * Deallocate a session descriptor. */ static void free_session(session_t *sp) { free(sp->se_device); if (sp->se_getty) { free(sp->se_getty); free(sp->se_getty_argv_space); free(sp->se_getty_argv); } if (sp->se_window) { free(sp->se_window); free(sp->se_window_argv_space); free(sp->se_window_argv); } if (sp->se_type) free(sp->se_type); free(sp); } /* * Allocate a new session descriptor. * Mark it SE_PRESENT. */ static session_t * new_session(session_t *sprev, struct ttyent *typ) { session_t *sp; int fd; if ((typ->ty_status & TTY_ON) == 0 || typ->ty_name == 0 || typ->ty_getty == 0) return 0; sp = (session_t *) calloc(1, sizeof (session_t)); sp->se_flags |= SE_PRESENT; sp->se_device = malloc(sizeof(_PATH_DEV) + strlen(typ->ty_name)); sprintf(sp->se_device, "%s%s", _PATH_DEV, typ->ty_name); /* * Attempt to open the device, if we get "device not configured" * then don't add the device to the session list. */ if ((fd = open(sp->se_device, O_RDONLY | O_NONBLOCK, 0)) < 0) { if (errno == ENXIO) { free_session(sp); return (0); } } else close(fd); if (setupargv(sp, typ) == 0) { free_session(sp); return (0); } sp->se_next = 0; if (sprev == NULL) { sessions = sp; sp->se_prev = 0; } else { sprev->se_next = sp; sp->se_prev = sprev; } return sp; } /* * Calculate getty and if useful window argv vectors. */ static int setupargv(session_t *sp, struct ttyent *typ) { if (sp->se_getty) { free(sp->se_getty); free(sp->se_getty_argv_space); free(sp->se_getty_argv); } sp->se_getty = malloc(strlen(typ->ty_getty) + strlen(typ->ty_name) + 2); sprintf(sp->se_getty, "%s %s", typ->ty_getty, typ->ty_name); sp->se_getty_argv_space = strdup(sp->se_getty); sp->se_getty_argv = construct_argv(sp->se_getty_argv_space); if (sp->se_getty_argv == NULL) { warning("can't parse getty for port %s", sp->se_device); free(sp->se_getty); free(sp->se_getty_argv_space); sp->se_getty = sp->se_getty_argv_space = 0; return (0); } if (sp->se_window) { free(sp->se_window); free(sp->se_window_argv_space); free(sp->se_window_argv); } sp->se_window = sp->se_window_argv_space = 0; sp->se_window_argv = 0; if (typ->ty_window) { sp->se_window = strdup(typ->ty_window); sp->se_window_argv_space = strdup(sp->se_window); sp->se_window_argv = construct_argv(sp->se_window_argv_space); if (sp->se_window_argv == NULL) { warning("can't parse window for port %s", sp->se_device); free(sp->se_window_argv_space); free(sp->se_window); sp->se_window = sp->se_window_argv_space = 0; return (0); } } if (sp->se_type) free(sp->se_type); sp->se_type = typ->ty_type ? strdup(typ->ty_type) : 0; return (1); } /* * Walk the list of ttys and create sessions for each active line. */ static state_func_t read_ttys(void) { session_t *sp, *snext; struct ttyent *typ; /* * Destroy any previous session state. * There shouldn't be any, but just in case... */ for (sp = sessions; sp; sp = snext) { snext = sp->se_next; free_session(sp); } sessions = 0; if (start_session_db()) return (state_func_t) single_user; /* * Allocate a session entry for each active port. * Note that sp starts at 0. */ while ((typ = getttyent()) != NULL) if ((snext = new_session(sp, typ)) != NULL) sp = snext; endttyent(); return (state_func_t) multi_user; } /* * Start a window system running. */ static void start_window_system(session_t *sp) { pid_t pid; sigset_t mask; char term[64], *env[2]; int status; if ((pid = fork()) == -1) { emergency("can't fork for window system on port %s: %m", sp->se_device); /* hope that getty fails and we can try again */ return; } if (pid) { waitpid(-1, &status, 0); return; } /* reparent window process to the init to not make a zombie on exit */ if ((pid = fork()) == -1) { emergency("can't fork for window system on port %s: %m", sp->se_device); _exit(1); } if (pid) _exit(0); sigemptyset(&mask); sigprocmask(SIG_SETMASK, &mask, (sigset_t *) 0); if (setsid() < 0) emergency("setsid failed (window) %m"); #ifdef LOGIN_CAP setprocresources(RESOURCE_WINDOW); #endif if (sp->se_type) { /* Don't use malloc after fork */ strcpy(term, "TERM="); strncat(term, sp->se_type, sizeof(term) - 6); env[0] = term; env[1] = 0; } else env[0] = 0; execve(sp->se_window_argv[0], sp->se_window_argv, env); stall("can't exec window system '%s' for port %s: %m", sp->se_window_argv[0], sp->se_device); _exit(1); } /* * Start a login session running. */ static pid_t start_getty(session_t *sp) { pid_t pid; sigset_t mask; time_t current_time = time((time_t *) 0); int too_quick = 0; char term[64], *env[2]; if (current_time >= sp->se_started && current_time - sp->se_started < GETTY_SPACING) { if (++sp->se_nspace > GETTY_NSPACE) { sp->se_nspace = 0; too_quick = 1; } } else sp->se_nspace = 0; /* * fork(), not vfork() -- we can't afford to block. */ if ((pid = fork()) == -1) { emergency("can't fork for getty on port %s: %m", sp->se_device); return -1; } if (pid) return pid; if (too_quick) { warning("getty repeating too quickly on port %s, sleeping %d secs", sp->se_device, GETTY_SLEEP); sleep((unsigned) GETTY_SLEEP); } if (sp->se_window) { start_window_system(sp); sleep(WINDOW_WAIT); } sigemptyset(&mask); sigprocmask(SIG_SETMASK, &mask, (sigset_t *) 0); #ifdef LOGIN_CAP setprocresources(RESOURCE_GETTY); #endif if (sp->se_type) { /* Don't use malloc after fork */ strcpy(term, "TERM="); strncat(term, sp->se_type, sizeof(term) - 6); env[0] = term; env[1] = 0; } else env[0] = 0; execve(sp->se_getty_argv[0], sp->se_getty_argv, env); stall("can't exec getty '%s' for port %s: %m", sp->se_getty_argv[0], sp->se_device); _exit(1); } /* * Collect exit status for a child. * If an exiting login, start a new login running. */ static void collect_child(pid_t pid) { session_t *sp, *sprev, *snext; if (! sessions) return; if (! (sp = find_session(pid))) return; del_session(sp); sp->se_process = 0; if (sp->se_flags & SE_SHUTDOWN) { if ((sprev = sp->se_prev) != NULL) sprev->se_next = sp->se_next; else sessions = sp->se_next; if ((snext = sp->se_next) != NULL) snext->se_prev = sp->se_prev; free_session(sp); return; } if ((pid = start_getty(sp)) == -1) { /* serious trouble */ requested_transition = clean_ttys; return; } sp->se_process = pid; sp->se_started = time((time_t *) 0); add_session(sp); } /* * Catch a signal and request a state transition. */ static void transition_handler(int sig) { switch (sig) { case SIGHUP: if (current_state == read_ttys || current_state == multi_user || current_state == clean_ttys || current_state == catatonia) requested_transition = clean_ttys; break; case SIGUSR2: howto = RB_POWEROFF; case SIGUSR1: howto |= RB_HALT; case SIGINT: Reboot = TRUE; case SIGTERM: if (current_state == read_ttys || current_state == multi_user || current_state == clean_ttys || current_state == catatonia) requested_transition = death; else requested_transition = death_single; break; case SIGTSTP: if (current_state == runcom || current_state == read_ttys || current_state == clean_ttys || current_state == multi_user || current_state == catatonia) requested_transition = catatonia; break; case SIGEMT: requested_transition = reroot; break; default: requested_transition = 0; break; } } /* * Take the system multiuser. */ static state_func_t multi_user(void) { pid_t pid; session_t *sp; requested_transition = 0; /* * If the administrator has not set the security level to -1 * to indicate that the kernel should not run multiuser in secure * mode, and the run script has not set a higher level of security * than level 1, then put the kernel into secure mode. */ if (getsecuritylevel() == 0) setsecuritylevel(1); for (sp = sessions; sp; sp = sp->se_next) { if (sp->se_process) continue; if ((pid = start_getty(sp)) == -1) { /* serious trouble */ requested_transition = clean_ttys; break; } sp->se_process = pid; sp->se_started = time((time_t *) 0); add_session(sp); } while (!requested_transition) if ((pid = waitpid(-1, (int *) 0, 0)) != -1) collect_child(pid); return (state_func_t) requested_transition; } /* * This is an (n*2)+(n^2) algorithm. We hope it isn't run often... */ static state_func_t clean_ttys(void) { session_t *sp, *sprev; struct ttyent *typ; int devlen; char *old_getty, *old_window, *old_type; /* * mark all sessions for death, (!SE_PRESENT) * as we find or create new ones they'll be marked as keepers, * we'll later nuke all the ones not found in /etc/ttys */ for (sp = sessions; sp != NULL; sp = sp->se_next) sp->se_flags &= ~SE_PRESENT; devlen = sizeof(_PATH_DEV) - 1; while ((typ = getttyent()) != NULL) { for (sprev = 0, sp = sessions; sp; sprev = sp, sp = sp->se_next) if (strcmp(typ->ty_name, sp->se_device + devlen) == 0) break; if (sp) { /* we want this one to live */ sp->se_flags |= SE_PRESENT; if ((typ->ty_status & TTY_ON) == 0 || typ->ty_getty == 0) { sp->se_flags |= SE_SHUTDOWN; kill(sp->se_process, SIGHUP); continue; } sp->se_flags &= ~SE_SHUTDOWN; old_getty = sp->se_getty ? strdup(sp->se_getty) : 0; old_window = sp->se_window ? strdup(sp->se_window) : 0; old_type = sp->se_type ? strdup(sp->se_type) : 0; if (setupargv(sp, typ) == 0) { warning("can't parse getty for port %s", sp->se_device); sp->se_flags |= SE_SHUTDOWN; kill(sp->se_process, SIGHUP); } else if ( !old_getty || (!old_type && sp->se_type) || (old_type && !sp->se_type) || (!old_window && sp->se_window) || (old_window && !sp->se_window) || (strcmp(old_getty, sp->se_getty) != 0) || (old_window && strcmp(old_window, sp->se_window) != 0) || (old_type && strcmp(old_type, sp->se_type) != 0) ) { /* Don't set SE_SHUTDOWN here */ sp->se_nspace = 0; sp->se_started = 0; kill(sp->se_process, SIGHUP); } if (old_getty) free(old_getty); if (old_window) free(old_window); if (old_type) free(old_type); continue; } new_session(sprev, typ); } endttyent(); /* * sweep through and kill all deleted sessions * ones who's /etc/ttys line was deleted (SE_PRESENT unset) */ for (sp = sessions; sp != NULL; sp = sp->se_next) { if ((sp->se_flags & SE_PRESENT) == 0) { sp->se_flags |= SE_SHUTDOWN; kill(sp->se_process, SIGHUP); } } return (state_func_t) multi_user; } /* * Block further logins. */ static state_func_t catatonia(void) { session_t *sp; for (sp = sessions; sp; sp = sp->se_next) sp->se_flags |= SE_SHUTDOWN; return (state_func_t) multi_user; } /* * Note SIGALRM. */ static void alrm_handler(int sig) { (void)sig; clang = 1; } /* * Bring the system down to single user. */ static state_func_t death(void) { int block, blocked; size_t len; /* Temporarily block suspend. */ len = sizeof(blocked); block = 1; if (sysctlbyname("kern.suspend_blocked", &blocked, &len, &block, sizeof(block)) == -1) blocked = 0; /* * Also revoke the TTY here. Because runshutdown() may reopen * the TTY whose getty we're killing here, there is no guarantee * runshutdown() will perform the initial open() call, causing * the terminal attributes to be misconfigured. */ revoke_ttys(); /* Try to run the rc.shutdown script within a period of time */ runshutdown(); /* Unblock suspend if we blocked it. */ if (!blocked) sysctlbyname("kern.suspend_blocked", NULL, NULL, &blocked, sizeof(blocked)); return (state_func_t) death_single; } /* * Do what is necessary to reinitialize single user mode or reboot * from an incomplete state. */ static state_func_t death_single(void) { int i; pid_t pid; static const int death_sigs[2] = { SIGTERM, SIGKILL }; revoke(_PATH_CONSOLE); for (i = 0; i < 2; ++i) { if (kill(-1, death_sigs[i]) == -1 && errno == ESRCH) return (state_func_t) single_user; clang = 0; alarm(DEATH_WATCH); do if ((pid = waitpid(-1, (int *)0, 0)) != -1) collect_child(pid); while (clang == 0 && errno != ECHILD); if (errno == ECHILD) return (state_func_t) single_user; } warning("some processes would not die; ps axl advised"); return (state_func_t) single_user; } static void revoke_ttys(void) { session_t *sp; for (sp = sessions; sp; sp = sp->se_next) { sp->se_flags |= SE_SHUTDOWN; kill(sp->se_process, SIGHUP); revoke(sp->se_device); } } /* * Run the system shutdown script. * * Exit codes: XXX I should document more * -2 shutdown script terminated abnormally * -1 fatal error - can't run script * 0 good. * >0 some error (exit code) */ static int runshutdown(void) { pid_t pid, wpid; int status; int shutdowntimeout; size_t len; char *argv[4]; const char *shell; struct sigaction sa; struct stat sb; /* * rc.shutdown is optional, so to prevent any unnecessary * complaints from the shell we simply don't run it if the * file does not exist. If the stat() here fails for other * reasons, we'll let the shell complain. */ if (stat(_PATH_RUNDOWN, &sb) == -1 && errno == ENOENT) return 0; shell = get_shell(); if ((pid = fork()) == 0) { sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sa.sa_handler = SIG_IGN; sigaction(SIGTSTP, &sa, (struct sigaction *)0); sigaction(SIGHUP, &sa, (struct sigaction *)0); open_console(); char _sh[] = "sh"; char _reboot[] = "reboot"; char _single[] = "single"; char _path_rundown[] = _PATH_RUNDOWN; argv[0] = _sh; argv[1] = _path_rundown; argv[2] = Reboot ? _reboot : _single; argv[3] = 0; sigprocmask(SIG_SETMASK, &sa.sa_mask, (sigset_t *) 0); #ifdef LOGIN_CAP setprocresources(RESOURCE_RC); #endif execv(shell, argv); warning("can't exec %s for %s: %m", shell, _PATH_RUNDOWN); _exit(1); /* force single user mode */ } if (pid == -1) { emergency("can't fork for %s on %s: %m", shell, _PATH_RUNDOWN); while (waitpid(-1, (int *) 0, WNOHANG) > 0) continue; sleep(STALL_TIMEOUT); return -1; } len = sizeof(shutdowntimeout); if (sysctlbyname("kern.init_shutdown_timeout", &shutdowntimeout, &len, NULL, 0) == -1 || shutdowntimeout < 2) shutdowntimeout = DEATH_SCRIPT; alarm(shutdowntimeout); clang = 0; /* * Copied from single_user(). This is a bit paranoid. * Use the same ALRM handler. */ do { if ((wpid = waitpid(-1, &status, WUNTRACED)) != -1) collect_child(wpid); if (clang == 1) { /* we were waiting for the sub-shell */ kill(wpid, SIGTERM); warning("timeout expired for %s on %s: %m; going to " "single user mode", shell, _PATH_RUNDOWN); return -1; } if (wpid == -1) { if (errno == EINTR) continue; warning("wait for %s on %s failed: %m; going to " "single user mode", shell, _PATH_RUNDOWN); return -1; } if (wpid == pid && WIFSTOPPED(status)) { warning("init: %s on %s stopped, restarting\n", shell, _PATH_RUNDOWN); kill(pid, SIGCONT); wpid = -1; } } while (wpid != pid && !clang); /* Turn off the alarm */ alarm(0); if (WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM && requested_transition == catatonia) { /* * /etc/rc.shutdown executed /sbin/reboot; * wait for the end quietly */ sigset_t s; sigfillset(&s); for (;;) sigsuspend(&s); } if (!WIFEXITED(status)) { warning("%s on %s terminated abnormally, going to " "single user mode", shell, _PATH_RUNDOWN); return -2; } if ((status = WEXITSTATUS(status)) != 0) warning("%s returned status %d", _PATH_RUNDOWN, status); return status; } static char * strk(char *p) { static char *t; char *q; int c; if (p) t = p; if (!t) return 0; c = *t; while (c == ' ' || c == '\t' ) c = *++t; if (!c) { t = 0; return 0; } q = t; if (c == '\'') { c = *++t; q = t; while (c && c != '\'') c = *++t; if (!c) /* unterminated string */ q = t = 0; else *t++ = 0; } else { while (c && c != ' ' && c != '\t' ) c = *++t; *t++ = 0; if (!c) t = 0; } return q; } #ifdef LOGIN_CAP static void setprocresources(const char *cname) { login_cap_t *lc; if ((lc = login_getclassbyname(cname, NULL)) != NULL) { setusercontext(lc, (struct passwd*)NULL, 0, LOGIN_SETPRIORITY | LOGIN_SETRESOURCES | LOGIN_SETLOGINCLASS | LOGIN_SETCPUMASK); login_close(lc); } } #endif Index: user/alc/PQ_LAUNDRY/sys/boot/common/self_reloc.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/common/self_reloc.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/boot/common/self_reloc.c (revision 306832) @@ -1,125 +1,131 @@ /*- * Copyright (c) 2008-2010 Rui Paulo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #if defined(__aarch64__) #define ElfW_Rel Elf64_Rela #define ElfW_Dyn Elf64_Dyn #define ELFW_R_TYPE ELF64_R_TYPE #define ELF_RELA #elif defined(__arm__) || defined(__i386__) #define ElfW_Rel Elf32_Rel #define ElfW_Dyn Elf32_Dyn #define ELFW_R_TYPE ELF32_R_TYPE #elif defined(__amd64__) #define ElfW_Rel Elf64_Rel #define ElfW_Dyn Elf64_Dyn #define ELFW_R_TYPE ELF64_R_TYPE #else #error architecture not supported #endif #if defined(__aarch64__) #define RELOC_TYPE_NONE R_AARCH64_NONE #define RELOC_TYPE_RELATIVE R_AARCH64_RELATIVE #elif defined(__amd64__) #define RELOC_TYPE_NONE R_X86_64_NONE #define RELOC_TYPE_RELATIVE R_X86_64_RELATIVE #elif defined(__arm__) #define RELOC_TYPE_NONE R_ARM_NONE #define RELOC_TYPE_RELATIVE R_ARM_RELATIVE #elif defined(__i386__) #define RELOC_TYPE_NONE R_386_NONE #define RELOC_TYPE_RELATIVE R_386_RELATIVE #endif void self_reloc(Elf_Addr baseaddr, ElfW_Dyn *dynamic); /* * A simple elf relocator. */ void self_reloc(Elf_Addr baseaddr, ElfW_Dyn *dynamic) { Elf_Word relsz, relent; Elf_Addr *newaddr; ElfW_Rel *rel; ElfW_Dyn *dynp; /* * Find the relocation address, its size and the relocation entry. */ relsz = 0; relent = 0; for (dynp = dynamic; dynp->d_tag != DT_NULL; dynp++) { switch (dynp->d_tag) { case DT_REL: case DT_RELA: rel = (ElfW_Rel *)(dynp->d_un.d_ptr + baseaddr); break; case DT_RELSZ: case DT_RELASZ: relsz = dynp->d_un.d_val; break; case DT_RELENT: case DT_RELAENT: relent = dynp->d_un.d_val; break; default: break; } } /* * Perform the actual relocation. */ for (; relsz > 0; relsz -= relent) { switch (ELFW_R_TYPE(rel->r_info)) { case RELOC_TYPE_NONE: /* No relocation needs be performed. */ break; case RELOC_TYPE_RELATIVE: - /* Address relative to the base address. */ newaddr = (Elf_Addr *)(rel->r_offset + baseaddr); - *newaddr += baseaddr; - /* Add the addend when the ABI uses them */ #ifdef ELF_RELA - *newaddr += rel->r_addend; + /* + * For R_AARCH64_RELATIVE we need to calculate the + * delta between the address we are run from and the + * address we are linked at. As the latter is 0 we + * just use the address we are run from for this. + */ + *newaddr = baseaddr + rel->r_addend; +#else + /* Address relative to the base address. */ + *newaddr += baseaddr; #endif break; default: /* XXX: do we need other relocations ? */ break; } rel = (ElfW_Rel *)(void *)((caddr_t) rel + relent); } } Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h (revision 306832) @@ -1,366 +1,368 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #ifndef _SYS_FS_ZFS_ZNODE_H #define _SYS_FS_ZFS_ZNODE_H #ifdef _KERNEL #include #include #include #include #include #include #include #endif #include #include #ifdef __cplusplus extern "C" { #endif /* * Additional file level attributes, that are stored * in the upper half of zp_flags */ #define ZFS_READONLY 0x0000000100000000 #define ZFS_HIDDEN 0x0000000200000000 #define ZFS_SYSTEM 0x0000000400000000 #define ZFS_ARCHIVE 0x0000000800000000 #define ZFS_IMMUTABLE 0x0000001000000000 #define ZFS_NOUNLINK 0x0000002000000000 #define ZFS_APPENDONLY 0x0000004000000000 #define ZFS_NODUMP 0x0000008000000000 #define ZFS_OPAQUE 0x0000010000000000 #define ZFS_AV_QUARANTINED 0x0000020000000000 #define ZFS_AV_MODIFIED 0x0000040000000000 #define ZFS_REPARSE 0x0000080000000000 #define ZFS_OFFLINE 0x0000100000000000 #define ZFS_SPARSE 0x0000200000000000 #define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ { \ if (value) \ pflags |= attr; \ else \ pflags &= ~attr; \ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \ &pflags, sizeof (pflags), tx)); \ } /* * Define special zfs pflags */ #define ZFS_XATTR 0x1 /* is an extended attribute */ #define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ #define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ #define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ #define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ #define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ #define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ #define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ #define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] #define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] #define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] #define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] #define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] #define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] #define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] #define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] #define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] #define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] #define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] #define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] #define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] #define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] #define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] #define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] #define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] #define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] #define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] #define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] /* * Is ID ephemeral? */ #define IS_EPHEMERAL(x) (x > MAXUID) /* * Should we use FUIDs? */ #define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) #define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) #define MASTER_NODE_OBJ 1 /* * Special attributes for master node. * "userquota@" and "groupquota@" are also valid (from * zfs_userquota_prop_prefixes[]). */ #define ZFS_FSID "FSID" #define ZFS_UNLINKED_SET "DELETE_QUEUE" #define ZFS_ROOT_OBJ "ROOT" #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" #define ZFS_SHARES_DIR "SHARES" #define ZFS_SA_ATTRS "SA_ATTRS" /* * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in * the directory entries. */ #ifndef IFTODT #define IFTODT(mode) (((mode) & S_IFMT) >> 12) #endif /* * The directory entry has the type (currently unused on Solaris) in the * top 4 bits, and the object number in the low 48 bits. The "middle" * 12 bits are unused. */ #define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) /* * Directory entry locks control access to directory entries. * They are used to protect creates, deletes, and renames. * Each directory znode has a mutex and a list of locked names. */ #ifdef _KERNEL typedef struct zfs_dirlock { char *dl_name; /* directory entry being locked */ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ uint16_t dl_namesize; /* set if dl_name was allocated */ kcondvar_t dl_cv; /* wait for entry to be unlocked */ struct znode *dl_dzp; /* directory znode */ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ } zfs_dirlock_t; typedef struct znode { struct zfsvfs *z_zfsvfs; vnode_t *z_vnode; uint64_t z_id; /* object ID for this znode */ #ifdef illumos kmutex_t z_lock; /* znode modification lock */ krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ #endif kmutex_t z_range_lock; /* protects changes to z_range_avl */ avl_tree_t z_range_avl; /* avl tree of file range locks */ uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ uint8_t z_moved; /* Has this znode been moved? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_gen; /* generation (cached) */ uint64_t z_size; /* file size (cached) */ uint64_t z_atime[2]; /* atime (cached) */ uint64_t z_links; /* file links (cached) */ uint64_t z_pflags; /* pflags (cached) */ uint64_t z_uid; /* uid fuid (cached) */ uint64_t z_gid; /* gid fuid (cached) */ mode_t z_mode; /* mode (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ sa_handle_t *z_sa_hdl; /* handle to sa data */ boolean_t z_is_sa; /* are we native sa? */ } znode_t; /* * Range locking rules * -------------------- * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole * file range needs to be locked as RL_WRITER. Only then can the pages be * freed etc and zp_size reset. zp_size must be set within range lock. * 2. For writes and punching holes (zfs_write & zfs_space) just the range * being written or freed needs to be locked as RL_WRITER. * Multiple writes at the end of the file must coordinate zp_size updates * to ensure data isn't lost. A compare and swap loop is currently used * to ensure the file size is at least the offset last written. * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being * read needs to be locked as RL_READER. A check against zp_size can then * be made for reading beyond end of file. */ /* * Convert between znode pointers and vnode pointers */ #ifdef DEBUG static __inline vnode_t * ZTOV(znode_t *zp) { vnode_t *vp = zp->z_vnode; ASSERT(vp == NULL || vp->v_data == NULL || vp->v_data == zp); return (vp); } static __inline znode_t * VTOZ(vnode_t *vp) { znode_t *zp = (znode_t *)vp->v_data; ASSERT(zp == NULL || zp->z_vnode == NULL || zp->z_vnode == vp); return (zp); } #else #define ZTOV(ZP) ((ZP)->z_vnode) #define VTOZ(VP) ((znode_t *)(VP)->v_data) #endif /* Called on entry to each ZFS vnode and vfs operation */ #define ZFS_ENTER(zfsvfs) \ { \ rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ if ((zfsvfs)->z_unmounted) { \ ZFS_EXIT(zfsvfs); \ return (EIO); \ } \ } /* Must be called before exiting the vop */ #define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) /* Verifies the znode is valid */ #define ZFS_VERIFY_ZP(zp) \ if ((zp)->z_sa_hdl == NULL) { \ ZFS_EXIT((zp)->z_zfsvfs); \ return (EIO); \ } \ /* * Macros for dealing with dmu_buf_hold */ #define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) #define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) #define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) #define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) #define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) /* Encode ZFS stored time values from a struct timespec */ #define ZFS_TIME_ENCODE(tp, stmp) \ { \ (stmp)[0] = (uint64_t)(tp)->tv_sec; \ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } /* Decode ZFS stored time values to a struct timespec */ #define ZFS_TIME_DECODE(tp, stmp) \ { \ (tp)->tv_sec = (time_t)(stmp)[0]; \ (tp)->tv_nsec = (long)(stmp)[1]; \ } /* * Timestamp defines */ #define ACCESSED (AT_ATIME) #define STATE_CHANGED (AT_CTIME) #define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) #define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); extern int zfs_init_fs(zfsvfs_t *, znode_t **); extern void zfs_set_dataprop(objset_t *); extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, dmu_tx_t *tx); extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], uint64_t [2], boolean_t); extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); extern int zfs_rezget(znode_t *); extern void zfs_zinactive(znode_t *); extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_znode_free(znode_t *); extern void zfs_remove_op_tables(); extern int zfs_create_op_tables(); extern dev_t zfs_cmpldev(uint64_t); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern void zfs_znode_dmu_fini(znode_t *); extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, vattr_t *vap); extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, vattr_t *vap); extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, char *name, uint64_t foid); #define ZFS_NO_OBJECT 0 /* no object id */ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link); extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, int ioflag); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); #ifndef ZFS_NO_ACL extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); #endif extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; extern int zfsfstype; +extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf); + #endif /* _KERNEL */ extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_ZNODE_H */ Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 306832) @@ -1,2518 +1,2518 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_objset_close(zfsvfs_t *zfsvfs); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, .vfs_root = zfs_root, .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (sys_shutdown && spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } #ifndef __FreeBSD_kernel__ static int zfs_create_unique_device(dev_t *dev) { major_t new_major; do { ASSERT3U(zfs_minor, <=, MAXMIN32); minor_t start = zfs_minor; do { mutex_enter(&zfs_dev_mtx); if (zfs_minor >= MAXMIN32) { /* * If we're still using the real major * keep out of /dev/zfs and /dev/zvol minor * number space. If we're using a getudev()'ed * major number, we can use all of its minors. */ if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) zfs_minor = ZFS_MIN_MINOR; else zfs_minor = 0; } else { zfs_minor++; } *dev = makedevice(zfs_major, zfs_minor); mutex_exit(&zfs_dev_mtx); } while (vfs_devismounted(*dev) && zfs_minor != start); if (zfs_minor == start) { /* * We are using all ~262,000 minor numbers for the * current major number. Create a new major number. */ if ((new_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "zfs_mount: Can't get unique major " "device number."); return (-1); } mutex_enter(&zfs_dev_mtx); zfs_major = new_major; zfs_minor = 0; mutex_exit(&zfs_dev_mtx); } else { break; } /* CONSTANTCONDITION */ } while (1); return (0); } #endif /* !__FreeBSD_kernel__ */ static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; #ifdef illumos boolean_t devices = B_FALSE; boolean_t do_devices = B_FALSE; #endif boolean_t xattr = B_FALSE; boolean_t do_xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { xattr = B_FALSE; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { xattr = B_TRUE; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. The problem is that spa_config_sync() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); #ifdef illumos error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); #endif error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (SET_ERROR(ENOENT)); /* * If we have a NULL data pointer * then assume the id's aren't changing and * return EEXIST to the dmu to let it know to * use the same ids */ if (data == NULL) return (SET_ERROR(EEXIST)); if (bonustype == DMU_OT_ZNODE) { znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; sa_hdr_phys_t *sap = data; sa_hdr_phys_t sa = *sap; boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled * in yet. */ *userp = 0; *groupp = 0; return (0); } if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { sa.sa_magic = SA_MAGIC; sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; } else { VERIFY3U(sa.sa_magic, ==, SA_MAGIC); } hdrsize = sa_hdrsize(&sa); VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); if (swap) { *userp = BSWAP_64(*userp); *groupp = BSWAP_64(*groupp); } } return (0); } static void fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, char *domainbuf, int buflen, uid_t *ridp) { uint64_t fuid; const char *domain; fuid = strtonum(fuidstr, NULL); domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); if (domain) (void) strlcpy(domainbuf, domain, buflen); else domainbuf[0] = '\0'; *ridp = FUID_RID(fuid); } static uint64_t zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) { switch (type) { case ZFS_PROP_USERUSED: return (DMU_USERUSED_OBJECT); case ZFS_PROP_GROUPUSED: return (DMU_GROUPUSED_OBJECT); case ZFS_PROP_USERQUOTA: return (zfsvfs->z_userquota_obj); case ZFS_PROP_GROUPQUOTA: return (zfsvfs->z_groupquota_obj); } return (0); } int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) { int error; zap_cursor_t zc; zap_attribute_t za; zfs_useracct_t *buf = vbuf; uint64_t obj; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) { *bufsizep = 0; return (0); } for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > *bufsizep) break; fuidstr_to_sid(zfsvfs, za.za_name, buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); buf->zu_space = za.za_first_integer; buf++; } if (error == ENOENT) error = 0; ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; *cookiep = zap_cursor_serialize(&zc); zap_cursor_fini(&zc); return (error); } /* * buf must be big enough (eg, 32 bytes) */ static int id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, char *buf, boolean_t addok) { uint64_t fuid; int domainid = 0; if (domain && domain[0]) { domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); if (domainid == -1) return (SET_ERROR(ENOENT)); } fuid = FUID_ENCODE(domainid, rid); (void) sprintf(buf, "%llx", (longlong_t)fuid); return (0); } int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valp) { char buf[32]; int err; uint64_t obj; *valp = 0; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) return (0); err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); if (err) return (err); err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); if (err == ENOENT) err = 0; return (err); } int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota) { char buf[32]; int err; dmu_tx_t *tx; uint64_t *objp; boolean_t fuid_dirtied; if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) return (SET_ERROR(EINVAL)); if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) return (SET_ERROR(ENOTSUP)); objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : &zfsvfs->z_groupquota_obj; err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); if (err) return (err); fuid_dirtied = zfsvfs->z_fuid_dirty; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); if (*objp == 0) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, zfs_userquota_prop_prefixes[type]); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&zfsvfs->z_lock); if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } mutex_exit(&zfsvfs->z_lock); if (quota == 0) { err = zap_remove(zfsvfs->z_os, *objp, buf, tx); if (err == ENOENT) err = 0; } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } ASSERT(err == 0); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); dmu_tx_commit(tx); return (err); } boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; int err; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); (void) sprintf(buf, "%llx", (longlong_t)fuid); err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); if (err != 0) return (B_FALSE); err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); if (err != 0) return (B_FALSE); return (used >= quota); } boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) { uint64_t fuid; uint64_t quotaobj; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; fuid = isgroup ? zp->z_gid : zp->z_uid; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } int zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); /* * We claim to always be readonly so we can open snapshots; * other ZPL code will prevent us from writing to snapshots. */ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); if (error) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); #ifdef DIAGNOSTIC rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); #else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); #endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; else zfs_unlinked_drain(zfsvfs); /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } return (0); } extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; /* * This is a barrier to prevent the filesystem from going away in * zfs_znode_move() until we can safely ensure that the filesystem is * not unmounted. We consider the filesystem valid before the barrier * and invalid after the barrier. */ rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrm_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_vfs) { if (zfsvfs->z_use_fuids) { vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } else { vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; vnode_t *vp; ASSERT(vfsp); ASSERT(osname); error = zfsvfs_create(osname, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; #ifdef illumos /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; if (zfs_create_unique_device(&mount_dev) == -1) { error = SET_ERROR(ENODEV); goto out; } ASSERT(vfs_devismounted(mount_dev) == 0); #endif if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | vfsp->mnt_vfc->vfc_typenum & 0xFF; /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef SECLABEL /* * Convert a decimal digit string to a uint64_t integer. */ static int str_to_uint64(char *str, uint64_t *objnum) { uint64_t num = 0; while (*str) { if (*str < '0' || *str > '9') return (SET_ERROR(EINVAL)); num = num*10 + *str++ - '0'; } *objnum = num; return (0); } /* * The boot path passed from the boot loader is in the form of * "rootpool-name/root-filesystem-object-number'. Convert this * string to a dataset name: "rootpool-name/root-filesystem-name". */ static int zfs_parse_bootfs(char *bpath, char *outpath) { char *slashp; uint64_t objnum; int error; if (*bpath == 0 || *bpath == '/') return (SET_ERROR(EINVAL)); (void) strcpy(outpath, bpath); slashp = strchr(bpath, '/'); /* if no '/', just return the pool name */ if (slashp == NULL) { return (0); } /* if not a number, just return the root dataset name */ if (str_to_uint64(slashp+1, &objnum)) { return (0); } *slashp = '\0'; error = dsl_dsobj_to_dsname(bpath, objnum, outpath); *slashp = '/'; return (error); } /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 0 : EACCES); } return (SET_ERROR(EACCES)); } /* * Determine whether the mount is allowed according to MAC check. * by comparing (where appropriate) label of the dataset against * the label of the zone being mounted into. If the dataset has * no label, create one. * * Returns 0 if access allowed, error otherwise (e.g. EACCES) */ static int zfs_mount_label_policy(vfs_t *vfsp, char *osname) { int error, retv; zone_t *mntzone = NULL; ts_label_t *mnt_tsl; bslabel_t *mnt_sl; bslabel_t ds_sl; char ds_hexsl[MAXNAMELEN]; retv = EACCES; /* assume the worst */ /* * Start by getting the dataset label if it exists. */ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error) return (SET_ERROR(EACCES)); /* * If labeling is NOT enabled, then disallow the mount of datasets * which have a non-default label already. No other label checks * are needed. */ if (!is_system_labeled()) { if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); return (SET_ERROR(EACCES)); } /* * Get the label of the mountpoint. If mounting into the global * zone (i.e. mountpoint is not within an active zone and the * zoned property is off), the label must be default or * admin_low/admin_high only; no other checks are needed. */ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); if (mntzone->zone_id == GLOBAL_ZONEID) { uint64_t zoned; zone_rele(mntzone); if (dsl_prop_get_integer(osname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EACCES)); if (!zoned) return (zfs_check_global_label(osname, ds_hexsl)); else /* * This is the case of a zone dataset being mounted * initially, before the zone has been fully created; * allow this mount into global zone. */ return (0); } mnt_tsl = mntzone->zone_slabel; ASSERT(mnt_tsl != NULL); label_hold(mnt_tsl); mnt_sl = label2bslabel(mnt_tsl); if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { /* * The dataset doesn't have a real label, so fabricate one. */ char *str = NULL; if (l_to_str_internal(mnt_sl, &str) == 0 && dsl_prop_set_string(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), ZPROP_SRC_LOCAL, str) == 0) retv = 0; if (str != NULL) kmem_free(str, strlen(str) + 1); } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { /* * Now compare labels to complete the MAC check. If the * labels are equal then allow access. If the mountpoint * label dominates the dataset label, allow readonly access. * Otherwise, access is denied. */ if (blequal(mnt_sl, &ds_sl)) retv = 0; else if (bldominates(mnt_sl, &ds_sl)) { vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); retv = 0; } } label_rele(mnt_tsl); zone_rele(mntzone); return (retv); } #endif /* SECLABEL */ #ifdef OPENSOLARIS_MOUNTROOT static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { int error = 0; static int zfsrootdone = 0; zfsvfs_t *zfsvfs = NULL; znode_t *zp = NULL; vnode_t *vp = NULL; char *zfs_bootfs; char *zfs_devid; ASSERT(vfsp); /* * The filesystem that we mount as root is defined in the * boot property "zfs-bootfs" with a format of * "poolname/root-dataset-objnum". */ if (why == ROOT_INIT) { if (zfsrootdone++) return (SET_ERROR(EBUSY)); /* * the process of doing a spa_load will require the * clock to be set before we could (for example) do * something better by looking at the timestamp on * an uberblock, so just set it to -1. */ clkset(-1); if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { cmn_err(CE_NOTE, "spa_get_bootfs: can not get " "bootfs name"); return (SET_ERROR(EINVAL)); } zfs_devid = spa_get_bootprop("diskdevid"); error = spa_import_rootpool(rootfs.bo_name, zfs_devid); if (zfs_devid) spa_free_bootprop(zfs_devid); if (error) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "spa_import_rootpool: error %d", error); return (error); } if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", error); return (error); } spa_free_bootprop(zfs_bootfs); if (error = vfs_lock(vfsp)) return (error); if (error = zfs_domount(vfsp, rootfs.bo_name)) { cmn_err(CE_NOTE, "zfs_domount: error %d", error); goto out; } zfsvfs = (zfsvfs_t *)vfsp->vfs_data; ASSERT(zfsvfs); if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { cmn_err(CE_NOTE, "zfs_zget: error %d", error); goto out; } vp = ZTOV(zp); mutex_enter(&vp->v_lock); vp->v_flag |= VROOT; mutex_exit(&vp->v_lock); rootvp = vp; /* * Leave rootvp held. The root file system is never unmounted. */ vfs_add((struct vnode *)0, vfsp, (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); out: vfs_unlock(vfsp); return (error); } else if (why == ROOT_REMOUNT) { readonly_changed_cb(vfsp->vfs_data, B_FALSE); vfsp->vfs_flag |= VFS_REMOUNT; /* refresh mount options */ zfs_unregister_callbacks(vfsp->vfs_data); return (zfs_register_callbacks(vfsp)); } else if (why == ROOT_UNMOUNT) { zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); (void) zfs_sync(vfsp, 0, 0); return (0); } /* * if "why" is equal to anything else other than ROOT_INIT, * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. */ return (SET_ERROR(ENOTSUP)); } #endif /* OPENSOLARIS_MOUNTROOT */ static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(poolname, osname, p - osname); poolname[p - osname] = '\0'; } return (0); } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; #ifdef illumos if (mvp->v_type != VDIR) return (SET_ERROR(ENOTDIR)); mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } mutex_exit(&mvp->v_lock); /* * ZFS does not support passing unparsed data in via MS_DATA. * Users should use the MS_OPTIONSTR interface; this means * that all option parsing is already done and the options struct * can be interrogated. */ if ((uap->flags & MS_DATA) && uap->datalen > 0) #else /* !illumos */ if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS)) return (SET_ERROR(EPERM)); if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); #endif /* illumos */ /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK(mvp, 0); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK(mvp, 0); goto out; } VOP_UNLOCK(mvp, 0); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curthread) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = SET_ERROR(EPERM); goto out; } #ifdef SECLABEL error = zfs_mount_label_policy(vfsp, osname); if (error) goto out; #endif vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); #ifdef illumos /* * Add an extra VFS_HOLD on our parent vfs so that it can't * disappear due to a forced unmount. */ if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) VFS_HOLD(mvp->v_vfsp); #endif out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); #ifdef FREEBSD_NAMECACHE - cache_purgevfs(zfsvfs->z_parent->z_vfs); + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relavent for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * We purge the parent filesystem's vfsp as the parent filesystem * and all of its snapshots have their vnode's v_vfsp set to the * parent's filesystem's vfsp. Note, 'z_parent' is self * referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); ret = vflush(vfsp, 0, 0, td); ASSERT(ret == EBUSY); if (!(fflag & MS_FORCE)) { if (zfsvfs->z_ctldir->v_count > 1) return (EBUSY); ASSERT(zfsvfs->z_ctldir->v_count == 1); } zfsctl_destroy(zfsvfs); ASSERT(zfsvfs->z_ctldir == NULL); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) { if (!zfsvfs->z_issnap) { zfsctl_create(zfsvfs); ASSERT(zfsvfs->z_ctldir != NULL); } return (ret); } #ifdef illumos if (!(fflag & MS_FORCE)) { /* * Check the number of active vnodes in the file system. * Our count is maintained in the vfs structure, but the * number is off by 1 to indicate a hold on the vfs * structure itself. * * The '.zfs' directory maintains a reference of its * own, and any active references underneath are * reflected in the vnode count. */ if (zfsvfs->z_ctldir == NULL) { if (vfsp->vfs_count > 1) return (SET_ERROR(EBUSY)); } else { if (vfsp->vfs_count > 2 || zfsvfs->z_ctldir->v_count > 1) return (SET_ERROR(EBUSY)); } } #endif VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (err == 0) err = vn_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); } static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { *vpp = zfsvfs->z_ctldir; ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else if (object == zfsvfs->z_shares_dir) { VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { vref(*vpp); } ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if (err = zfs_zget(zfsvfs, object, &zp)) { ZFS_EXIT(zfsvfs); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags | LK_RETRY); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { int err; znode_t *zp; ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY0(dmu_objset_hold(osname, zfsvfs, &os)); VERIFY3P(os->os_dsl_dataset->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(os->os_dsl_dataset)); dmu_objset_rele(os, zfsvfs); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via ZFS_ENTER_VERIFY_VP * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; #ifdef illumos /* * If this is a snapshot, we have an extra VFS_HOLD on our parent * from zfs_mount(). Release it here. If we came through * zfs_mountroot() instead, we didn't grab an extra hold, so * skip the VFS_RELE for rootvfs. */ if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); #endif zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void zfs_fini(void) { zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { const char *pname; int error = ENOENT; /* * Look up the file system's value for the property. For the * version property, we look up a slightly different string. */ if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; default: return (error); } error = 0; } return (error); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void)strlcpy(fromname, newname, sizeof(mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", newname, fromname + oldlen); (void)strlcpy(fromname, tmpbuf, sizeof(mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 306832) @@ -1,6067 +1,6078 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; offset_t ndata; dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { #ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else *(offset_t *)data = off; #endif return (0); } #ifdef illumos case _FIO_COUNT_FILLED: { /* * _FIO_COUNT_FILLED adds a new ioctl command which * exposes the number of filled blocks in a * ZFS object. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Wait for all dirty blocks for this object * to get synced out to disk, and the DMU info * updated. */ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); if (error) { ZFS_EXIT(zfsvfs); return (error); } /* * Retrieve fill count from DMU object. */ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); if (error) { ZFS_EXIT(zfsvfs); return (error); } ndata = doi.doi_fill_count; ZFS_EXIT(zfsvfs); if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) return (SET_ERROR(EFAULT)); return (0); } #endif } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considred clean despite have some dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb"); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp == NULL) { pp = vm_page_alloc(obj, OFF_TO_IDX(start), VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED | VM_ALLOC_SBUSY); } else { ASSERT(pp != NULL && !pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_subtract(pp->object, 1); } static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb"); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_lock(pp); vm_page_hold(pp); vm_page_unlock(pp); } else pp = NULL; break; } return (pp); } static void page_unhold(vm_page_t pp) { vm_page_lock(pp); vm_page_unhold(pp); vm_page_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH);; zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeupn(obj, 0); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_hold(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); #ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); #else error = vn_io_fault_uiomove(va + off, bytes, uio); #endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unhold(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); #ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* illumos */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our * callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef illumos /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ #ifdef illumos size_t cbytes; #endif abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); #ifdef illumos if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); #else ssize_t resid = uio->uio_resid; error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); if (error != 0) { uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; dmu_return_arcbuf(abuf); break; } #endif } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } #ifdef illumos ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); #endif } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); #ifdef illumos ASSERT(error == 0); #else ASSERT(error == 0 || error == EFAULT); #endif } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef illumos if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } #ifdef __FreeBSD__ /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } #endif if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT(zio != NULL); ASSERT(size != 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *obp = dmu_buf_get_blkptr(db); if (obp) { ASSERT(BP_IS_HOLE(bp)); *bp = *obp; } zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= zp->z_blksz); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; int ltype; ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (dvp->v_iflag & VI_DOOMED) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* fast path (should be redundant with vfs namecache) */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } #endif /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { vrele(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, "snapshot", vpp, NULL, 0, NULL, kcred, NULL, NULL, NULL); ZFS_EXIT(zfsvfs); if (error == 0) { error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); } goto out; } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { error = 0; if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) error = SET_ERROR(ENOTSUP); else *vpp = zfsctl_root(zdp); ZFS_EXIT(zfsvfs); if (error == 0) error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); goto out; } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } out: if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(1); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: if (error == 0) { *vpp = ZTOV(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked, toobig = FALSE; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; zp = VTOZ(vp); xattr_obj = 0; xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *vpp = NULL; if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } cache_purge(dvp); error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); cache_purge(vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t links; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; #endif vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) links = zp->z_links + 1; else links = zp->z_links; vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ vap->va_size = zp->z_size; #ifdef illumos vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), times, sizeof (times)); ZFS_TIME_DECODE(&xoap->xoa_createtime, times); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). */ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp, 0); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(*svpp, 0); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; char *snm = scnp->cn_nameptr; char *tnm = tcnp->cn_nameptr; int error = 0; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_check(szp, sdzp, tdzp)) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_purge(*svpp); if (*tvpp != NULL) cache_purge(*tvpp); cache_purge_negative(tdvp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(*svpp, 0); VOP_UNLOCK(sdvp, 0); out: /* original two vnodes are locked */ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK(*tvpp, 0); if (tdvp != *tvpp) VOP_UNLOCK(tdvp, 0); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; #ifdef illumos if (fidp->fid_len < size) { fidp->fid_len = size; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } #else fidp->fid_len = size; #endif zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = INT_MAX; return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #ifdef illumos case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { if (!zfs_dirempty(xzp)) *valp = 1; vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. */ error = 0; } ZFS_EXIT(zfsvfs); return (error); case _PC_SATTR_ENABLED: case _PC_SATTR_EXISTS: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_type == VREG || vp->v_type == VDIR); return (0); case _PC_ACCESS_FILTERING: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && vp->v_type == VDIR; return (0); case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); #endif /* illumos */ case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); #ifdef illumos case _PC_TIMESTAMP_RESOLUTION: /* nanosecond timestamp resolution */ *valp = 1L; return (0); #endif case _PC_ACL_EXTENDED: *valp = 0; return (0); case _PC_ACL_NFS4: *valp = 1; return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } static int zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; vm_page_t mlast; vm_object_t object; caddr_t va; struct sf_buf *sf; off_t startoff, endoff; int i, error; vm_pindex_t reqstart, reqend; int lsize, size; object = m[0]->object; error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_vmobject_wlock(object); if (m[count - 1]->valid != 0 && --count == 0) { zfs_vmobject_wunlock(object); goto out; } mlast = m[count - 1]; if (IDX_TO_OFF(mlast->pindex) >= object->un_pager.vnp.vnp_size) { zfs_vmobject_wunlock(object); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, count); lsize = PAGE_SIZE; if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size) lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex); zfs_vmobject_wunlock(object); for (i = 0; i < count; i++) { size = PAGE_SIZE; if (i == count - 1) size = lsize; va = zfs_map_page(m[i], &sf); error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex), size, va, DMU_READ_PREFETCH); if (size != PAGE_SIZE) bzero(va + size, PAGE_SIZE - size); zfs_unmap_page(sf); if (error != 0) goto out; } zfs_vmobject_wlock(object); for (i = 0; i < count; i++) m[i]->valid = VM_PAGE_BITS_ALL; zfs_vmobject_wunlock(object); out: ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error == 0) { if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; return (zfs_vm_pagerret_ok); } else return (zfs_vm_pagerret_error); } static int zfs_freebsd_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { goto out; } top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { i = 0; for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); (void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_range_unlock(rl); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } int zfs_freebsd_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } static int zfs_freebsd_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL, NULL)); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0)); } static int zfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (error); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vattr_t *vap = ap->a_vap; xvattr_t xvap; u_long fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; u_long fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the security.jail.chflags_allowed sysctl is * is non-zero; otherwise, they behave like unprivileged * processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, ap->a_target, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) *ap->a_retval = val; else if (error == EOPNOTSUPP) error = vop_stdpathconf(ap); return (error); } static int zfs_freebsd_fifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_ACL_PATH_MAX: case _PC_MAC_PRESENT: return (zfs_freebsd_pathconf(ap)); default: return (fifo_specops.vop_pathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. */ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } /* * Vnode operating to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to remove a named attribute. */ int zfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; u_char dirbuf[sizeof(struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof(attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { u_char nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof(dirbuf); auio.uio_resid = sizeof(dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof(dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done;) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } int zfs_freebsd_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } int zfs_freebsd_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); kmem_free(aaclp, aclbsize); return (error); } int zfs_freebsd_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp;; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); uint64_t parent; int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) { + char name[MAXNAMLEN + 1]; + znode_t *dzp; + size_t len; + + error = zfs_znode_parent_and_name(zp, &dzp, name); + if (error == 0) { + len = strlen(name); + *ap->a_buflen -= len; + bcopy(name, ap->a_buf + *ap->a_buflen, len); + *ap->a_vpp = ZTOV(dzp); + } ZFS_EXIT(zfsvfs); - return (vop_stdvptocnp(ap)); + return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; vhold(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = vget(covered_vp, LK_EXCLUSIVE | LK_VNHELD, curthread); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC static int zfs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { zfsvfs_t *zfsvfs; znode_t *zp; vnode_t *vp; int flags; int err; vp = ap->a_vp; flags = ap->a_flags; if ((flags & LK_INTERLOCK) == 0 && (flags & LK_NOWAIT) == 0 && (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL && (zp->z_pflags & ZFS_XATTR) == 0) { zfsvfs = zp->z_zfsvfs; VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); } err = vop_stdlock(ap); if ((flags & LK_INTERLOCK) != 0 && (flags & LK_NOWAIT) == 0 && (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL && (zp->z_pflags & ZFS_XATTR) == 0) { zfsvfs = zp->z_zfsvfs; VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_fifo_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, }; Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c (revision 306832) @@ -1,2194 +1,2225 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2011 Martin Matuska */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)"); /* * Define ZNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef DEBUG #define ZNODE_STATS #endif /* DEBUG */ #ifdef ZNODE_STATS #define ZNODE_STAT_ADD(stat) ((stat)++) #else #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL /* * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to * be freed before it can be safely accessed. */ krwlock_t zfsvfs_lock; static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ static void znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) { /* * We should never drop all dbuf refs without first clearing * the eviction callback. */ panic("evicting znode %p\n", user_ptr); } extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; extern struct vop_vector zfs_shareops; static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; POINTER_INVALIDATE(&zp->z_zfsvfs); list_link_init(&zp->z_link_node); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); zp->z_acl_cached = NULL; zp->z_vnode = NULL; zp->z_moved = 0; return (0); } /*ARGSUSED*/ static void zfs_znode_cache_destructor(void *buf, void *arg) { znode_t *zp = buf; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); ASSERT(ZTOV(zp) == NULL); vn_free(ZTOV(zp)); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_acl_lock); avl_destroy(&zp->z_range_avl); mutex_destroy(&zp->z_range_lock); ASSERT(zp->z_acl_cached == NULL); } #ifdef ZNODE_STATS static struct { uint64_t zms_zfsvfs_invalid; uint64_t zms_zfsvfs_recheck1; uint64_t zms_zfsvfs_unmounted; uint64_t zms_zfsvfs_recheck2; uint64_t zms_obj_held; uint64_t zms_vnode_locked; uint64_t zms_not_only_dnlc; } znode_move_stats; #endif /* ZNODE_STATS */ #ifdef illumos static void zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) { vnode_t *vp; /* Copy fields. */ nzp->z_zfsvfs = ozp->z_zfsvfs; /* Swap vnodes. */ vp = nzp->z_vnode; nzp->z_vnode = ozp->z_vnode; ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ ZTOV(ozp)->v_data = ozp; ZTOV(nzp)->v_data = nzp; nzp->z_id = ozp->z_id; ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); nzp->z_unlinked = ozp->z_unlinked; nzp->z_atime_dirty = ozp->z_atime_dirty; nzp->z_zn_prefetch = ozp->z_zn_prefetch; nzp->z_blksz = ozp->z_blksz; nzp->z_seq = ozp->z_seq; nzp->z_mapcnt = ozp->z_mapcnt; nzp->z_gen = ozp->z_gen; nzp->z_sync_cnt = ozp->z_sync_cnt; nzp->z_is_sa = ozp->z_is_sa; nzp->z_sa_hdl = ozp->z_sa_hdl; bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2); nzp->z_links = ozp->z_links; nzp->z_size = ozp->z_size; nzp->z_pflags = ozp->z_pflags; nzp->z_uid = ozp->z_uid; nzp->z_gid = ozp->z_gid; nzp->z_mode = ozp->z_mode; /* * Since this is just an idle znode and kmem is already dealing with * memory pressure, release any cached ACL. */ if (ozp->z_acl_cached) { zfs_acl_free(ozp->z_acl_cached); ozp->z_acl_cached = NULL; } sa_set_userp(nzp->z_sa_hdl, nzp); /* * Invalidate the original znode by clearing fields that provide a * pointer back to the znode. Set the low bit of the vfs pointer to * ensure that zfs_znode_move() recognizes the znode as invalid in any * subsequent callback. */ ozp->z_sa_hdl = NULL; POINTER_INVALIDATE(&ozp->z_zfsvfs); /* * Mark the znode. */ nzp->z_moved = 1; ozp->z_moved = (uint8_t)-1; } /*ARGSUSED*/ static kmem_cbrc_t zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) { znode_t *ozp = buf, *nzp = newbuf; zfsvfs_t *zfsvfs; vnode_t *vp; /* * The znode is on the file system's list of known znodes if the vfs * pointer is valid. We set the low bit of the vfs pointer when freeing * the znode to invalidate it, and the memory patterns written by kmem * (baddcafe and deadbeef) set at least one of the two low bits. A newly * created znode sets the vfs pointer last of all to indicate that the * znode is known and in a valid state to be moved by this function. */ zfsvfs = ozp->z_zfsvfs; if (!POINTER_IS_VALID(zfsvfs)) { ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Close a small window in which it's possible that the filesystem could * be unmounted and freed, and zfsvfs, though valid in the previous * statement, could point to unrelated memory by the time we try to * prevent the filesystem from being unmounted. */ rw_enter(&zfsvfs_lock, RW_WRITER); if (zfsvfs != ozp->z_zfsvfs) { rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the znode is still valid, then so is the file system. We know that * no valid file system can be freed while we hold zfsvfs_lock, so we * can safely ensure that the filesystem is not and will not be * unmounted. The next statement is equivalent to ZFS_ENTER(). */ rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); if (zfsvfs->z_unmounted) { ZFS_EXIT(zfsvfs); rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); return (KMEM_CBRC_DONT_KNOW); } rw_exit(&zfsvfs_lock); mutex_enter(&zfsvfs->z_znodes_lock); /* * Recheck the vfs pointer in case the znode was removed just before * acquiring the lock. */ if (zfsvfs != ozp->z_zfsvfs) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold z_znodes_lock, the * znode cannot be freed and fields within the znode can be safely * accessed. Now, prevent a race with zfs_zget(). */ if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); return (KMEM_CBRC_LATER); } vp = ZTOV(ozp); if (mutex_tryenter(&vp->v_lock) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); return (KMEM_CBRC_LATER); } /* Only move znodes that are referenced _only_ by the DNLC. */ if (vp->v_count != 1 || !vn_in_dnlc(vp)) { mutex_exit(&vp->v_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); return (KMEM_CBRC_LATER); } /* * The znode is known and in a valid state to move. We're holding the * locks needed to execute the critical section. */ zfs_znode_move_impl(ozp, nzp); mutex_exit(&vp->v_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); list_link_replace(&ozp->z_link_node, &nzp->z_link_node); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); return (KMEM_CBRC_YES); } #endif /* illumos */ void zfs_znode_init(void) { /* * Initialize zcache */ rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); kmem_cache_set_move(znode_cache, zfs_znode_move); } void zfs_znode_fini(void) { #ifdef illumos /* * Cleanup vfs & vnode ops */ zfs_remove_op_tables(); #endif /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; rw_destroy(&zfsvfs_lock); } #ifdef illumos struct vnodeops *zfs_dvnodeops; struct vnodeops *zfs_fvnodeops; struct vnodeops *zfs_symvnodeops; struct vnodeops *zfs_xdvnodeops; struct vnodeops *zfs_evnodeops; struct vnodeops *zfs_sharevnodeops; void zfs_remove_op_tables() { /* * Remove vfs ops */ ASSERT(zfsfstype); (void) vfs_freevfsops_by_type(zfsfstype); zfsfstype = 0; /* * Remove vnode ops */ if (zfs_dvnodeops) vn_freevnodeops(zfs_dvnodeops); if (zfs_fvnodeops) vn_freevnodeops(zfs_fvnodeops); if (zfs_symvnodeops) vn_freevnodeops(zfs_symvnodeops); if (zfs_xdvnodeops) vn_freevnodeops(zfs_xdvnodeops); if (zfs_evnodeops) vn_freevnodeops(zfs_evnodeops); if (zfs_sharevnodeops) vn_freevnodeops(zfs_sharevnodeops); zfs_dvnodeops = NULL; zfs_fvnodeops = NULL; zfs_symvnodeops = NULL; zfs_xdvnodeops = NULL; zfs_evnodeops = NULL; zfs_sharevnodeops = NULL; } extern const fs_operation_def_t zfs_dvnodeops_template[]; extern const fs_operation_def_t zfs_fvnodeops_template[]; extern const fs_operation_def_t zfs_xdvnodeops_template[]; extern const fs_operation_def_t zfs_symvnodeops_template[]; extern const fs_operation_def_t zfs_evnodeops_template[]; extern const fs_operation_def_t zfs_sharevnodeops_template[]; int zfs_create_op_tables() { int error; /* * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). * In this case we just return as the ops vectors are already set up. */ if (zfs_dvnodeops) return (0); error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, &zfs_dvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, &zfs_fvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, &zfs_symvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, &zfs_xdvnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, &zfs_evnodeops); if (error) return (error); error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, &zfs_sharevnodeops); return (error); } #endif /* illumos */ int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { zfs_acl_ids_t acl_ids; vattr_t vattr; znode_t *sharezp; znode_t *zp; int error; vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0555; vattr.va_uid = crgetuid(kcred); vattr.va_gid = crgetgid(kcred); sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); sharezp->z_moved = 0; sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; sharezp->z_is_sa = zfsvfs->z_use_sa; VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, kcred, NULL, &acl_ids)); zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, sharezp); POINTER_INVALIDATE(&sharezp->z_zfsvfs); error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); zfsvfs->z_shares_dir = sharezp->z_id; zfs_acl_ids_free(&acl_ids); sa_handle_destroy(sharezp->z_sa_hdl); kmem_cache_free(znode_cache, sharezp); return (error); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; /* * Slap on VROOT if we are the root znode unless we are the root * node of a snapshot mounted under .zfs. */ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; vn_exists(ZTOV(zp)); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || zp->z_unlinked || RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } static void zfs_vnode_forget(vnode_t *vp) { /* copied from insmntque_stddtr */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Construct a new znode/vnode and intialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; vnode_t *vp; uint64_t mode; uint64_t parent; sa_bulk_attr_t bulk[9]; int count = 0; int error; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); KASSERT(curthread->td_vp_reserv > 0, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); if (error != 0) { kmem_cache_free(znode_cache, zp); return (NULL); } zp->z_vnode = vp; vp->v_data = zp; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); zp->z_moved = 0; /* * Defer setting z_zfsvfs until the znode is ready to be a candidate for * the zfs_znode_move() callback. */ zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; vp = ZTOV(zp); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, 8); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zfs_vnode_forget(vp); zp->z_vnode = NULL; kmem_cache_free(znode_cache, zp); return (NULL); } zp->z_mode = mode; vp->v_type = IFTOVT((mode_t)mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; #ifdef illumos case VBLK: case VCHR: { uint64_t rdev; VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)) == 0); vp->v_rdev = zfs_cmpldev(rdev); } break; #endif case VFIFO: #ifdef illumos case VSOCK: case VDOOR: #endif vp->v_op = &zfs_fifoops; break; case VREG: if (parent == zfsvfs->z_shares_dir) { ASSERT(zp->z_uid == 0 && zp->z_gid == 0); vp->v_op = &zfs_shareops; } break; #ifdef illumos case VLNK: vn_setops(vp, zfs_symvnodeops); break; default: vn_setops(vp, zfs_evnodeops); break; #endif } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); membar_producer(); /* * Everything else must be valid before assigning z_zfsvfs makes the * znode eligible for zfs_znode_move(). */ zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); /* * Acquire vnode lock before making it available to the world. */ #ifdef DIAGNOSTIC vop_lock1_t *orig_lock = vp->v_op->vop_lock1; vp->v_op->vop_lock1 = vop_stdlock; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_op->vop_lock1 = orig_lock; #else vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #endif VN_LOCK_AREC(vp); if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); #ifdef illumos VFS_HOLD(zfsvfs->z_vfs); #endif return (zp); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. * * OUT: zpp - allocated znode * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; timestruc_t now; uint64_t gen, obj; int err; int bonuslen; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t sa_attrs[ZPL_END]; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { obj = 0; vfs_timestamp(&now); gen = dmu_tx_get_txg(tx); } obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, tx)); } else { obj = zap_create_norm(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, tx)); } else { obj = dmu_object_alloc(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } else { dzp_pflags = dzp->z_pflags; } /* * If parent is an xattr, so am I. */ if (dzp_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (vap->va_type == VDIR) { size = 2; /* contents ("." and "..") */ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } else { size = links = 0; } if (vap->va_type == VBLK || vap->va_type == VCHR) { rdev = zfs_expldev(vap->va_rdev); } parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; /* * No execs denied will be deterimed when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } if (obj_type == DMU_OT_ZNODE || (vap->va_type == VBLK || vap->va_type == VCHR)) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); ASSERT(*zpp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { vnode_t *vp; vp = ZTOV(*zpp); vp->v_vflag |= VV_FORCEINSMQ; err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; KASSERT(err == 0, ("insmntque() failed: error %d", err)); } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; sa_handle_t *hdl; struct thread *td; int locked; int err; td = curthread; getnewvnode_reserve(1); again: *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); #ifdef __FreeBSD__ getnewvnode_drop_reserve(); #endif return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); ASSERT3U(zp->z_id, ==, obj_num); *zpp = zp; vp = ZTOV(zp); /* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */ VN_HOLD(vp); sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); locked = VOP_ISLOCKED(vp); VI_LOCK(vp); if ((vp->v_iflag & VI_DOOMED) != 0 && locked != LK_EXCLUSIVE) { /* * The vnode is doomed and this thread doesn't * hold the exclusive lock on it, so the vnode * must be being reclaimed by another thread. * Otherwise the doomed vnode is being reclaimed * by this thread and zfs_zget is called from * ZIL internals. */ VI_UNLOCK(vp); /* * XXX vrele() locks the vnode when the last reference * is dropped. Although in this case the vnode is * doomed / dead and so no inactivation is required, * the vnode lock is still acquired. That could result * in a LOR with z_teardown_lock if another thread holds * the vnode's lock and tries to take z_teardown_lock. * But that is only possible if the other thread peforms * a ZFS vnode operation on the vnode. That either * should not happen if the vnode is dead or the thread * should also have a refrence to the vnode and thus * our reference is not last. */ VN_RELE(vp); goto again; } VI_UNLOCK(vp); getnewvnode_drop_reserve(); return (0); } /* * Not found create new znode/vnode * but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } if (err == 0) { vnode_t *vp = ZTOV(zp); err = insmntque(vp, zfsvfs->z_vfs); if (err == 0) { vp->v_hash = obj_num; VOP_UNLOCK(vp, 0); } else { zp->z_vnode = NULL; zfs_znode_dmu_fini(zp); zfs_znode_free(zp); *zpp = NULL; } } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_info_t doi; dmu_buf_t *db; vnode_t *vp; uint64_t obj_num = zp->z_id; uint64_t mode, size; sa_bulk_attr_t bulk[8]; int err; int count = 0; uint64_t gen; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); size = zp->z_size; /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, sizeof (zp->z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, sizeof (zp->z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } zp->z_mode = mode; if (gen != zp->z_gen) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * It is highly improbable but still quite possible that two * objects in different datasets are created with the same * object numbers and in transaction groups with the same * numbers. znodes corresponding to those objects would * have the same z_id and z_gen, but their other attributes * may be different. * zfs recv -F may replace one of such objects with the other. * As a result file properties recorded in the replaced * object's vnode may no longer match the received object's * properties. At present the only cached property is the * files type recorded in v_type. * So, handle this case by leaving the old vnode and znode * disassociated from the actual object. A new vnode and a * znode will be created if the object is accessed * (e.g. via a look-up). The old vnode and znode will be * recycled when the last vnode reference is dropped. */ vp = ZTOV(zp); if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EIO); } zp->z_unlinked = (zp->z_links == 0); zp->z_blksz = doi.doi_data_block_size; vn_pages_remove(vp, 0, 0); if (zp->z_size != size) vnode_pager_setsize(vp, zp->z_size); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); zfs_znode_free(zp); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); /* * If this was the last reference to a file with no links, * remove the file from the file system. */ if (zp->z_unlinked) { ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_rmnode(zp); return; } zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_znode_free(zp); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_sa_hdl == NULL); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } kmem_cache_free(znode_cache, zp); #ifdef illumos VFS_RELE(zfsvfs->z_vfs); #endif } void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2], boolean_t have_tx) { timestruc_t now; vfs_timestamp(&now); if (have_tx) { /* will sa_bulk_update happen really soon? */ zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) { ZFS_TIME_ENCODE(&now, zp->z_atime); } if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, mtime); if (zp->z_zfsvfs->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, ctime); if (zp->z_zfsvfs->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } #ifdef illumos /* * This is a dummy interface used when pvn_vplist_dirty() should *not* * be calling back into the fs for a putpage(). E.g.: when truncating * a file, the pages being "thrown away* don't need to be written out. */ /* ARGSUSED */ static int zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, int flags, cred_t *cr) { ASSERT(0); return (0); } #endif /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; rl_t *rl; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_range_unlock(rl); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); vnode_pager_setsize(ZTOV(zp), end); zfs_range_unlock(rl); dmu_tx_commit(tx); return (0); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; int error; /* * Lock the range being freed. */ rl = zfs_range_lock(zp, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_range_unlock(rl); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); if (error == 0) { /* * In FreeBSD we cannot free block in the middle of a file, * but only at the end of a file, so this code path should * never happen. */ vnode_pager_setsize(ZTOV(zp), off); } zfs_range_unlock(rl); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; rl_t *rl; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_range_unlock(rl); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); if (error) { zfs_range_unlock(rl); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ vnode_pager_setsize(vp, end); zfs_range_unlock(rl); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; else return (error); } /* * Check for any locks in the region to be freed. */ if (MANDLOCK(vp, (mode_t)mode)) { uint64_t length = (len ? len : zp->z_size - off); if (error = chklock(vp, FWRITE, off, length, flag, NULL)) return (error); } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; int i; znode_t *rootzp = NULL; zfsvfs_t *zfsvfs; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_moved = 0; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); rootzp->z_zfsvfs = zfsvfs; VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); /* * Create shares directory */ error = zfs_create_share_dir(zfsvfs, tx); ASSERT(error == 0); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } - #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir; if (prevdb) zfs_release_sa_handle(prevhdl, prevdb, FTAG); if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { (void) sprintf(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); bcopy(component, path, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT(sa_db != NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } + +#ifdef _KERNEL +int +zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t parent; + int is_xattrdir; + int err; + + /* Extended attributes should not be visible as regular files. */ + if ((zp->z_pflags & ZFS_XATTR) != 0) + return (SET_ERROR(EINVAL)); + + err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, + &parent, &is_xattrdir); + if (err != 0) + return (err); + ASSERT0(is_xattrdir); + + /* No name as this is a root object. */ + if (parent == zp->z_id) + return (SET_ERROR(EINVAL)); + + err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), buf); + if (err != 0) + return (err); + err = zfs_zget(zfsvfs, parent, dzpp); + return (err); +} +#endif /* _KERNEL */ Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris (revision 306832) Property changes on: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/cddl/contrib/opensolaris:r306708-306831 Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_ioctl.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_ioctl.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_ioctl.h (revision 306832) @@ -1,347 +1,349 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_IOCTL_H__ #define __T4_IOCTL_H__ #include #include /* * Ioctl commands specific to this driver. */ enum { T4_GETREG = 0x40, /* read register */ T4_SETREG, /* write register */ T4_REGDUMP, /* dump of all registers */ T4_GET_FILTER_MODE, /* get global filter mode */ T4_SET_FILTER_MODE, /* set global filter mode */ T4_GET_FILTER, /* get information about a filter */ T4_SET_FILTER, /* program a filter */ T4_DEL_FILTER, /* delete a filter */ T4_GET_SGE_CONTEXT, /* get SGE context for a queue */ T4_LOAD_FW, /* flash firmware */ T4_GET_MEM, /* read memory */ T4_GET_I2C, /* read from i2c addressible device */ T4_CLEAR_STATS, /* clear a port's MAC statistics */ T4_SET_OFLD_POLICY, /* Set offload policy */ T4_SET_SCHED_CLASS, /* set sched class */ T4_SET_SCHED_QUEUE, /* set queue class */ T4_GET_TRACER, /* get information about a tracer */ T4_SET_TRACER, /* program a tracer */ + T4_LOAD_CFG, /* copy a config file to card's flash */ }; struct t4_reg { uint32_t addr; uint32_t size; uint64_t val; }; #define T4_REGDUMP_SIZE (160 * 1024) #define T5_REGDUMP_SIZE (332 * 1024) struct t4_regdump { uint32_t version; uint32_t len; /* bytes */ uint32_t *data; }; struct t4_data { uint32_t len; uint8_t *data; }; struct t4_i2c_data { uint8_t port_id; uint8_t dev_addr; uint8_t offset; uint8_t len; uint8_t data[8]; }; /* * A hardware filter is some valid combination of these. */ #define T4_FILTER_IPv4 0x1 /* IPv4 packet */ #define T4_FILTER_IPv6 0x2 /* IPv6 packet */ #define T4_FILTER_IP_SADDR 0x4 /* Source IP address or network */ #define T4_FILTER_IP_DADDR 0x8 /* Destination IP address or network */ #define T4_FILTER_IP_SPORT 0x10 /* Source IP port */ #define T4_FILTER_IP_DPORT 0x20 /* Destination IP port */ #define T4_FILTER_FCoE 0x40 /* Fibre Channel over Ethernet packet */ #define T4_FILTER_PORT 0x80 /* Physical ingress port */ #define T4_FILTER_VNIC 0x100 /* VNIC id or outer VLAN */ #define T4_FILTER_VLAN 0x200 /* VLAN ID */ #define T4_FILTER_IP_TOS 0x400 /* IPv4 TOS/IPv6 Traffic Class */ #define T4_FILTER_IP_PROTO 0x800 /* IP protocol */ #define T4_FILTER_ETH_TYPE 0x1000 /* Ethernet Type */ #define T4_FILTER_MAC_IDX 0x2000 /* MPS MAC address match index */ #define T4_FILTER_MPS_HIT_TYPE 0x4000 /* MPS match type */ #define T4_FILTER_IP_FRAGMENT 0x8000 /* IP fragment */ #define T4_FILTER_IC_VNIC 0x80000000 /* TP Ingress Config's F_VNIC bit. It indicates whether T4_FILTER_VNIC bit means VNIC id (PF/VF) or outer VLAN. 0 = oVLAN, 1 = VNIC */ /* Filter action */ enum { FILTER_PASS = 0, /* default */ FILTER_DROP, FILTER_SWITCH }; /* 802.1q manipulation on FILTER_SWITCH */ enum { VLAN_NOCHANGE = 0, /* default */ VLAN_REMOVE, VLAN_INSERT, VLAN_REWRITE }; /* MPS match type */ enum { UCAST_EXACT = 0, /* exact unicast match */ UCAST_HASH = 1, /* inexact (hashed) unicast match */ MCAST_EXACT = 2, /* exact multicast match */ MCAST_HASH = 3, /* inexact (hashed) multicast match */ PROMISC = 4, /* no match but port is promiscuous */ HYPPROMISC = 5, /* port is hypervisor-promisuous + not bcast */ BCAST = 6, /* broadcast packet */ }; /* Rx steering */ enum { DST_MODE_QUEUE, /* queue is directly specified by filter */ DST_MODE_RSS_QUEUE, /* filter specifies RSS entry containing queue */ DST_MODE_RSS, /* queue selected by default RSS hash lookup */ DST_MODE_FILT_RSS /* queue selected by hashing in filter-specified RSS subtable */ }; struct t4_filter_tuple { /* * These are always available. */ uint8_t sip[16]; /* source IP address (IPv4 in [3:0]) */ uint8_t dip[16]; /* destinatin IP address (IPv4 in [3:0]) */ uint16_t sport; /* source port */ uint16_t dport; /* destination port */ /* * A combination of these (up to 36 bits) is available. TP_VLAN_PRI_MAP * is used to select the global mode and all filters are limited to the * set of fields allowed by the global mode. */ uint16_t vnic; /* VNIC id (PF/VF) or outer VLAN tag */ uint16_t vlan; /* VLAN tag */ uint16_t ethtype; /* Ethernet type */ uint8_t tos; /* TOS/Traffic Type */ uint8_t proto; /* protocol type */ uint32_t fcoe:1; /* FCoE packet */ uint32_t iport:3; /* ingress port */ uint32_t matchtype:3; /* MPS match type */ uint32_t frag:1; /* fragmentation extension header */ uint32_t macidx:9; /* exact match MAC index */ uint32_t vlan_vld:1; /* VLAN valid */ uint32_t ovlan_vld:1; /* outer VLAN tag valid, value in "vnic" */ uint32_t pfvf_vld:1; /* VNIC id (PF/VF) valid, value in "vnic" */ }; struct t4_filter_specification { uint32_t hitcnts:1; /* count filter hits in TCB */ uint32_t prio:1; /* filter has priority over active/server */ uint32_t type:1; /* 0 => IPv4, 1 => IPv6 */ uint32_t action:2; /* drop, pass, switch */ uint32_t rpttid:1; /* report TID in RSS hash field */ uint32_t dirsteer:1; /* 0 => RSS, 1 => steer to iq */ uint32_t iq:10; /* ingress queue */ uint32_t maskhash:1; /* dirsteer=0: store RSS hash in TCB */ uint32_t dirsteerhash:1;/* dirsteer=1: 0 => TCB contains RSS hash */ /* 1 => TCB contains IQ ID */ /* * Switch proxy/rewrite fields. An ingress packet which matches a * filter with "switch" set will be looped back out as an egress * packet -- potentially with some Ethernet header rewriting. */ uint32_t eport:2; /* egress port to switch packet out */ uint32_t newdmac:1; /* rewrite destination MAC address */ uint32_t newsmac:1; /* rewrite source MAC address */ uint32_t newvlan:2; /* rewrite VLAN Tag */ uint8_t dmac[ETHER_ADDR_LEN]; /* new destination MAC address */ uint8_t smac[ETHER_ADDR_LEN]; /* new source MAC address */ uint16_t vlan; /* VLAN Tag to insert */ /* * Filter rule value/mask pairs. */ struct t4_filter_tuple val; struct t4_filter_tuple mask; }; struct t4_filter { uint32_t idx; uint16_t l2tidx; uint16_t smtidx; uint64_t hits; struct t4_filter_specification fs; }; /* Tx Scheduling Class parameters */ struct t4_sched_class_params { int8_t level; /* scheduler hierarchy level */ int8_t mode; /* per-class or per-flow */ int8_t rateunit; /* bit or packet rate */ int8_t ratemode; /* %port relative or kbps absolute */ int8_t channel; /* scheduler channel [0..N] */ int8_t cl; /* scheduler class [0..N] */ int32_t minrate; /* minimum rate */ int32_t maxrate; /* maximum rate */ int16_t weight; /* percent weight */ int16_t pktsize; /* average packet size */ }; /* * Support for "sched-class" command to allow a TX Scheduling Class to be * programmed with various parameters. */ struct t4_sched_params { int8_t subcmd; /* sub-command */ int8_t type; /* packet or flow */ union { struct { /* sub-command SCHED_CLASS_CONFIG */ int8_t minmax; /* minmax enable */ } config; struct t4_sched_class_params params; uint8_t reserved[6 + 8 * 8]; } u; }; enum { SCHED_CLASS_SUBCMD_CONFIG, /* config sub-command */ SCHED_CLASS_SUBCMD_PARAMS, /* params sub-command */ }; enum { SCHED_CLASS_TYPE_PACKET, }; enum { SCHED_CLASS_LEVEL_CL_RL, /* class rate limiter */ SCHED_CLASS_LEVEL_CL_WRR, /* class weighted round robin */ SCHED_CLASS_LEVEL_CH_RL, /* channel rate limiter */ }; enum { SCHED_CLASS_MODE_CLASS, /* per-class scheduling */ SCHED_CLASS_MODE_FLOW, /* per-flow scheduling */ }; enum { SCHED_CLASS_RATEUNIT_BITS, /* bit rate scheduling */ SCHED_CLASS_RATEUNIT_PKTS, /* packet rate scheduling */ }; enum { SCHED_CLASS_RATEMODE_REL, /* percent of port bandwidth */ SCHED_CLASS_RATEMODE_ABS, /* Kb/s */ }; /* * Support for "sched_queue" command to allow one or more NIC TX Queues to be * bound to a TX Scheduling Class. */ struct t4_sched_queue { uint8_t port; int8_t queue; /* queue index; -1 => all queues */ int8_t cl; /* class index; -1 => unbind */ }; #define T4_SGE_CONTEXT_SIZE 24 enum { SGE_CONTEXT_EGRESS, SGE_CONTEXT_INGRESS, SGE_CONTEXT_FLM, SGE_CONTEXT_CNM }; struct t4_sge_context { uint32_t mem_id; uint32_t cid; uint32_t data[T4_SGE_CONTEXT_SIZE / 4]; }; struct t4_mem_range { uint32_t addr; uint32_t len; uint32_t *data; }; #define T4_TRACE_LEN 112 struct t4_trace_params { uint32_t data[T4_TRACE_LEN / 4]; uint32_t mask[T4_TRACE_LEN / 4]; uint16_t snap_len; uint16_t min_len; uint8_t skip_ofst; uint8_t skip_len; uint8_t invert; uint8_t port; }; struct t4_tracer { uint8_t idx; uint8_t enabled; uint8_t valid; struct t4_trace_params tp; }; #define CHELSIO_T4_GETREG _IOWR('f', T4_GETREG, struct t4_reg) #define CHELSIO_T4_SETREG _IOW('f', T4_SETREG, struct t4_reg) #define CHELSIO_T4_REGDUMP _IOWR('f', T4_REGDUMP, struct t4_regdump) #define CHELSIO_T4_GET_FILTER_MODE _IOWR('f', T4_GET_FILTER_MODE, uint32_t) #define CHELSIO_T4_SET_FILTER_MODE _IOW('f', T4_SET_FILTER_MODE, uint32_t) #define CHELSIO_T4_GET_FILTER _IOWR('f', T4_GET_FILTER, struct t4_filter) #define CHELSIO_T4_SET_FILTER _IOW('f', T4_SET_FILTER, struct t4_filter) #define CHELSIO_T4_DEL_FILTER _IOW('f', T4_DEL_FILTER, struct t4_filter) #define CHELSIO_T4_GET_SGE_CONTEXT _IOWR('f', T4_GET_SGE_CONTEXT, \ struct t4_sge_context) #define CHELSIO_T4_LOAD_FW _IOW('f', T4_LOAD_FW, struct t4_data) #define CHELSIO_T4_GET_MEM _IOW('f', T4_GET_MEM, struct t4_mem_range) #define CHELSIO_T4_GET_I2C _IOWR('f', T4_GET_I2C, struct t4_i2c_data) #define CHELSIO_T4_CLEAR_STATS _IOW('f', T4_CLEAR_STATS, uint32_t) #define CHELSIO_T4_SCHED_CLASS _IOW('f', T4_SET_SCHED_CLASS, \ struct t4_sched_params) #define CHELSIO_T4_SCHED_QUEUE _IOW('f', T4_SET_SCHED_QUEUE, \ struct t4_sched_queue) #define CHELSIO_T4_GET_TRACER _IOWR('f', T4_GET_TRACER, struct t4_tracer) #define CHELSIO_T4_SET_TRACER _IOW('f', T4_SET_TRACER, struct t4_tracer) +#define CHELSIO_T4_LOAD_CFG _IOW('f', T4_LOAD_CFG, struct t4_data) #endif Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_main.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_main.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_main.c (revision 306832) @@ -1,9823 +1,9859 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #if defined(__i386__) || defined(__amd64__) #include #include #endif #ifdef DDB #include #include #endif #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_ioctl.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #include "t4_if.h" /* T4 bus driver interface */ static int t4_probe(device_t); static int t4_attach(device_t); static int t4_detach(device_t); static int t4_ready(device_t); static int t4_read_port_device(device_t, int, device_t *); static device_method_t t4_methods[] = { DEVMETHOD(device_probe, t4_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t4_driver = { "t4nex", t4_methods, sizeof(struct adapter) }; /* T4 port (cxgbe) interface */ static int cxgbe_probe(device_t); static int cxgbe_attach(device_t); static int cxgbe_detach(device_t); device_method_t cxgbe_methods[] = { DEVMETHOD(device_probe, cxgbe_probe), DEVMETHOD(device_attach, cxgbe_attach), DEVMETHOD(device_detach, cxgbe_detach), { 0, 0 } }; static driver_t cxgbe_driver = { "cxgbe", cxgbe_methods, sizeof(struct port_info) }; /* T4 VI (vcxgbe) interface */ static int vcxgbe_probe(device_t); static int vcxgbe_attach(device_t); static int vcxgbe_detach(device_t); static device_method_t vcxgbe_methods[] = { DEVMETHOD(device_probe, vcxgbe_probe), DEVMETHOD(device_attach, vcxgbe_attach), DEVMETHOD(device_detach, vcxgbe_detach), { 0, 0 } }; static driver_t vcxgbe_driver = { "vcxgbe", vcxgbe_methods, sizeof(struct vi_info) }; static d_ioctl_t t4_ioctl; static struct cdevsw t4_cdevsw = { .d_version = D_VERSION, .d_ioctl = t4_ioctl, .d_name = "t4nex", }; /* T5 bus driver interface */ static int t5_probe(device_t); static device_method_t t5_methods[] = { DEVMETHOD(device_probe, t5_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t5_driver = { "t5nex", t5_methods, sizeof(struct adapter) }; /* T5 port (cxl) interface */ static driver_t cxl_driver = { "cxl", cxgbe_methods, sizeof(struct port_info) }; /* T5 VI (vcxl) interface */ static driver_t vcxl_driver = { "vcxl", vcxgbe_methods, sizeof(struct vi_info) }; /* T6 bus driver interface */ static int t6_probe(device_t); static device_method_t t6_methods[] = { DEVMETHOD(device_probe, t6_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t6_driver = { "t6nex", t6_methods, sizeof(struct adapter) }; /* T6 port (cc) interface */ static driver_t cc_driver = { "cc", cxgbe_methods, sizeof(struct port_info) }; /* T6 VI (vcc) interface */ static driver_t vcc_driver = { "vcc", vcxgbe_methods, sizeof(struct vi_info) }; /* ifnet + media interface */ static void cxgbe_init(void *); static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t); static int cxgbe_transmit(struct ifnet *, struct mbuf *); static void cxgbe_qflush(struct ifnet *); static int cxgbe_media_change(struct ifnet *); static void cxgbe_media_status(struct ifnet *, struct ifmediareq *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services"); /* * Correct lock order when you need to acquire multiple locks is t4_list_lock, * then ADAPTER_LOCK, then t4_uld_list_lock. */ static struct sx t4_list_lock; SLIST_HEAD(, adapter) t4_list; #ifdef TCP_OFFLOAD static struct sx t4_uld_list_lock; SLIST_HEAD(, uld_info) t4_uld_list; #endif /* * Tunables. See tweak_tunables() too. * * Each tunable is set to a default value here if it's known at compile-time. * Otherwise it is set to -1 as an indication to tweak_tunables() that it should * provide a reasonable default when the driver is loaded. * * Tunables applicable to both T4 and T5 are under hw.cxgbe. Those specific to * T5 are under hw.cxl. */ /* * Number of queues for tx and rx, 10G and 1G, NIC and offload. */ #define NTXQ_10G 16 int t4_ntxq10g = -1; TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq10g); #define NRXQ_10G 8 int t4_nrxq10g = -1; TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq10g); #define NTXQ_1G 4 int t4_ntxq1g = -1; TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); #define NRXQ_1G 2 int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); #define NTXQ_VI 1 static int t4_ntxq_vi = -1; TUNABLE_INT("hw.cxgbe.ntxq_vi", &t4_ntxq_vi); #define NRXQ_VI 1 static int t4_nrxq_vi = -1; TUNABLE_INT("hw.cxgbe.nrxq_vi", &t4_nrxq_vi); static int t4_rsrv_noflowq = 0; TUNABLE_INT("hw.cxgbe.rsrv_noflowq", &t4_rsrv_noflowq); #ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g); #define NOFLDRXQ_10G 2 static int t4_nofldrxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq10g", &t4_nofldrxq10g); #define NOFLDTXQ_1G 2 static int t4_nofldtxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq1g", &t4_nofldtxq1g); #define NOFLDRXQ_1G 1 static int t4_nofldrxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq1g", &t4_nofldrxq1g); #define NOFLDTXQ_VI 1 static int t4_nofldtxq_vi = -1; TUNABLE_INT("hw.cxgbe.nofldtxq_vi", &t4_nofldtxq_vi); #define NOFLDRXQ_VI 1 static int t4_nofldrxq_vi = -1; TUNABLE_INT("hw.cxgbe.nofldrxq_vi", &t4_nofldrxq_vi); #endif #ifdef DEV_NETMAP #define NNMTXQ_VI 2 static int t4_nnmtxq_vi = -1; TUNABLE_INT("hw.cxgbe.nnmtxq_vi", &t4_nnmtxq_vi); #define NNMRXQ_VI 2 static int t4_nnmrxq_vi = -1; TUNABLE_INT("hw.cxgbe.nnmrxq_vi", &t4_nnmrxq_vi); #endif /* * Holdoff parameters for 10G and 1G ports. */ #define TMR_IDX_10G 1 int t4_tmr_idx_10g = TMR_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx_10g); #define PKTC_IDX_10G (-1) int t4_pktc_idx_10g = PKTC_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx_10g); #define TMR_IDX_1G 1 int t4_tmr_idx_1g = TMR_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_1G", &t4_tmr_idx_1g); #define PKTC_IDX_1G (-1) int t4_pktc_idx_1g = PKTC_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_1G", &t4_pktc_idx_1g); /* * Size (# of entries) of each tx and rx queue. */ unsigned int t4_qsize_txq = TX_EQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_txq", &t4_qsize_txq); unsigned int t4_qsize_rxq = RX_IQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_rxq", &t4_qsize_rxq); /* * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively). */ int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX; TUNABLE_INT("hw.cxgbe.interrupt_types", &t4_intr_types); /* * Configuration file. */ #define DEFAULT_CF "default" #define FLASH_CF "flash" #define UWIRE_CF "uwire" #define FPGA_CF "fpga" static char t4_cfg_file[32] = DEFAULT_CF; TUNABLE_STR("hw.cxgbe.config_file", t4_cfg_file, sizeof(t4_cfg_file)); /* * PAUSE settings (bit 0, 1 = rx_pause, tx_pause respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. */ static int t4_pause_settings = PAUSE_TX | PAUSE_RX; TUNABLE_INT("hw.cxgbe.pause_settings", &t4_pause_settings); /* * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed, * encouraged respectively). */ static unsigned int t4_fw_install = 1; TUNABLE_INT("hw.cxgbe.fw_install", &t4_fw_install); /* * ASIC features that will be used. Disable the ones you don't want so that the * chip resources aren't wasted on features that will not be used. */ static int t4_nbmcaps_allowed = 0; TUNABLE_INT("hw.cxgbe.nbmcaps_allowed", &t4_nbmcaps_allowed); static int t4_linkcaps_allowed = 0; /* No DCBX, PPP, etc. by default */ TUNABLE_INT("hw.cxgbe.linkcaps_allowed", &t4_linkcaps_allowed); static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS | FW_CAPS_CONFIG_SWITCH_EGRESS; TUNABLE_INT("hw.cxgbe.switchcaps_allowed", &t4_switchcaps_allowed); static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC; TUNABLE_INT("hw.cxgbe.niccaps_allowed", &t4_niccaps_allowed); static int t4_toecaps_allowed = -1; TUNABLE_INT("hw.cxgbe.toecaps_allowed", &t4_toecaps_allowed); static int t4_rdmacaps_allowed = -1; TUNABLE_INT("hw.cxgbe.rdmacaps_allowed", &t4_rdmacaps_allowed); static int t4_cryptocaps_allowed = 0; TUNABLE_INT("hw.cxgbe.cryptocaps_allowed", &t4_cryptocaps_allowed); static int t4_iscsicaps_allowed = -1; TUNABLE_INT("hw.cxgbe.iscsicaps_allowed", &t4_iscsicaps_allowed); static int t4_fcoecaps_allowed = 0; TUNABLE_INT("hw.cxgbe.fcoecaps_allowed", &t4_fcoecaps_allowed); static int t5_write_combine = 0; TUNABLE_INT("hw.cxl.write_combine", &t5_write_combine); static int t4_num_vis = 1; TUNABLE_INT("hw.cxgbe.num_vis", &t4_num_vis); /* Functions used by extra VIs to obtain unique MAC addresses for each VI. */ static int vi_mac_funcs[] = { FW_VI_FUNC_OFLD, FW_VI_FUNC_IWARP, FW_VI_FUNC_OPENISCSI, FW_VI_FUNC_OPENFCOE, FW_VI_FUNC_FOISCSI, FW_VI_FUNC_FOFCOE, }; struct intrs_and_queues { uint16_t intr_type; /* INTx, MSI, or MSI-X */ uint16_t nirq; /* Total # of vectors */ uint16_t intr_flags_10g;/* Interrupt flags for each 10G port */ uint16_t intr_flags_1g; /* Interrupt flags for each 1G port */ uint16_t ntxq10g; /* # of NIC txq's for each 10G port */ uint16_t nrxq10g; /* # of NIC rxq's for each 10G port */ uint16_t ntxq1g; /* # of NIC txq's for each 1G port */ uint16_t nrxq1g; /* # of NIC rxq's for each 1G port */ uint16_t rsrv_noflowq; /* Flag whether to reserve queue 0 */ uint16_t nofldtxq10g; /* # of TOE txq's for each 10G port */ uint16_t nofldrxq10g; /* # of TOE rxq's for each 10G port */ uint16_t nofldtxq1g; /* # of TOE txq's for each 1G port */ uint16_t nofldrxq1g; /* # of TOE rxq's for each 1G port */ /* The vcxgbe/vcxl interfaces use these and not the ones above. */ uint16_t ntxq_vi; /* # of NIC txq's */ uint16_t nrxq_vi; /* # of NIC rxq's */ uint16_t nofldtxq_vi; /* # of TOE txq's */ uint16_t nofldrxq_vi; /* # of TOE rxq's */ uint16_t nnmtxq_vi; /* # of netmap txq's */ uint16_t nnmrxq_vi; /* # of netmap rxq's */ }; struct filter_entry { uint32_t valid:1; /* filter allocated and valid */ uint32_t locked:1; /* filter is administratively locked */ uint32_t pending:1; /* filter action is pending firmware reply */ uint32_t smtidx:8; /* Source MAC Table index for smac */ struct l2t_entry *l2t; /* Layer Two Table entry for dmac */ struct t4_filter_specification fs; }; static void setup_memwin(struct adapter *); static void position_memwin(struct adapter *, int, uint32_t); static int rw_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int, int); static inline int read_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int); static inline int write_via_memwin(struct adapter *, int, uint32_t, const uint32_t *, int); static int validate_mem_range(struct adapter *, uint32_t, int); static int fwmtype_to_hwmtype(int); static int validate_mt_off_len(struct adapter *, int, uint32_t, int, uint32_t *); static int fixup_devlog_params(struct adapter *); static int cfg_itype_and_nqueues(struct adapter *, int, int, int, struct intrs_and_queues *); static int prep_firmware(struct adapter *); static int partition_resources(struct adapter *, const struct firmware *, const char *); static int get_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); static void build_medialist(struct port_info *, struct ifmedia *); static int cxgbe_init_synchronized(struct vi_info *); static int cxgbe_uninit_synchronized(struct vi_info *); static void quiesce_txq(struct adapter *, struct sge_txq *); static void quiesce_wrq(struct adapter *, struct sge_wrq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, driver_intr_t *, void *, char *); static int t4_free_irq(struct adapter *, struct irq *); static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void vi_refresh_stats(struct adapter *, struct vi_info *); static void cxgbe_refresh_stats(struct adapter *, struct port_info *); static void cxgbe_tick(void *); static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t); static void cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS); static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS); static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS); static int sysctl_temperature(SYSCTL_HANDLER_ARGS); #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la_t6(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS); static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_devlog(SYSCTL_HANDLER_ARGS); static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS); static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS); static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS); static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS); static int sysctl_meminfo(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS); static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS); static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS); static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tids(SYSCTL_HANDLER_ARGS); static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la(SYSCTL_HANDLER_ARGS); static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tc_params(SYSCTL_HANDLER_ARGS); #endif #ifdef TCP_OFFLOAD static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS); static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS); #endif static uint32_t fconf_iconf_to_mode(uint32_t, uint32_t); static uint32_t mode_to_fconf(uint32_t); static uint32_t mode_to_iconf(uint32_t); static int check_fspec_against_fconf_iconf(struct adapter *, struct t4_filter_specification *); static int get_filter_mode(struct adapter *, uint32_t *); static int set_filter_mode(struct adapter *, uint32_t); static inline uint64_t get_filter_hits(struct adapter *, uint32_t); static int get_filter(struct adapter *, struct t4_filter *); static int set_filter(struct adapter *, struct t4_filter *); static int del_filter(struct adapter *, struct t4_filter *); static void clear_filter(struct filter_entry *); static int set_filter_wr(struct adapter *, int); static int del_filter_wr(struct adapter *, int); static int set_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); static int get_sge_context(struct adapter *, struct t4_sge_context *); static int load_fw(struct adapter *, struct t4_data *); +static int load_cfg(struct adapter *, struct t4_data *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *, int); #endif static int mod_event(module_t, int, void *); static int notify_siblings(device_t, int); struct { uint16_t device; char *desc; } t4_pciids[] = { {0xa000, "Chelsio Terminator 4 FPGA"}, {0x4400, "Chelsio T440-dbg"}, {0x4401, "Chelsio T420-CR"}, {0x4402, "Chelsio T422-CR"}, {0x4403, "Chelsio T440-CR"}, {0x4404, "Chelsio T420-BCH"}, {0x4405, "Chelsio T440-BCH"}, {0x4406, "Chelsio T440-CH"}, {0x4407, "Chelsio T420-SO"}, {0x4408, "Chelsio T420-CX"}, {0x4409, "Chelsio T420-BT"}, {0x440a, "Chelsio T404-BT"}, {0x440e, "Chelsio T440-LP-CR"}, }, t5_pciids[] = { {0xb000, "Chelsio Terminator 5 FPGA"}, {0x5400, "Chelsio T580-dbg"}, {0x5401, "Chelsio T520-CR"}, /* 2 x 10G */ {0x5402, "Chelsio T522-CR"}, /* 2 x 10G, 2 X 1G */ {0x5403, "Chelsio T540-CR"}, /* 4 x 10G */ {0x5407, "Chelsio T520-SO"}, /* 2 x 10G, nomem */ {0x5409, "Chelsio T520-BT"}, /* 2 x 10GBaseT */ {0x540a, "Chelsio T504-BT"}, /* 4 x 1G */ {0x540d, "Chelsio T580-CR"}, /* 2 x 40G */ {0x540e, "Chelsio T540-LP-CR"}, /* 4 x 10G */ {0x5410, "Chelsio T580-LP-CR"}, /* 2 x 40G */ {0x5411, "Chelsio T520-LL-CR"}, /* 2 x 10G */ {0x5412, "Chelsio T560-CR"}, /* 1 x 40G, 2 x 10G */ {0x5414, "Chelsio T580-LP-SO-CR"}, /* 2 x 40G, nomem */ {0x5415, "Chelsio T502-BT"}, /* 2 x 1G */ #ifdef notyet {0x5404, "Chelsio T520-BCH"}, {0x5405, "Chelsio T540-BCH"}, {0x5406, "Chelsio T540-CH"}, {0x5408, "Chelsio T520-CX"}, {0x540b, "Chelsio B520-SR"}, {0x540c, "Chelsio B504-BT"}, {0x540f, "Chelsio Amsterdam"}, {0x5413, "Chelsio T580-CHR"}, #endif }, t6_pciids[] = { {0xc006, "Chelsio Terminator 6 FPGA"}, /* T6 PE10K6 FPGA (PF0) */ {0x6401, "Chelsio T6225-CR"}, /* 2 x 10/25G */ {0x6402, "Chelsio T6225-SO-CR"}, /* 2 x 10/25G, nomem */ {0x6407, "Chelsio T62100-LP-CR"}, /* 2 x 40/50/100G */ {0x6408, "Chelsio T62100-SO-CR"}, /* 2 x 40/50/100G, nomem */ {0x640d, "Chelsio T62100-CR"}, /* 2 x 40/50/100G */ {0x6410, "Chelsio T62100-DBG"}, /* 2 x 40/50/100G, debug */ }; #ifdef TCP_OFFLOAD /* * service_iq() has an iq and needs the fl. Offset of fl from the iq should be * exactly the same for both rxq and ofld_rxq. */ CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); static int t4_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xa000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t4_pciids); i++) { if (d == t4_pciids[i].device) { device_set_desc(dev, t4_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t5_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xb000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t5_pciids); i++) { if (d == t5_pciids[i].device) { device_set_desc(dev, t5_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t6_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); for (i = 0; i < nitems(t6_pciids); i++) { if (d == t6_pciids[i].device) { device_set_desc(dev, t6_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void t5_attribute_workaround(device_t dev) { device_t root_port; uint32_t v; /* * The T5 chips do not properly echo the No Snoop and Relaxed * Ordering attributes when replying to a TLP from a Root * Port. As a workaround, find the parent Root Port and * disable No Snoop and Relaxed Ordering. Note that this * affects all devices under this root port. */ root_port = pci_find_pcie_root_port(dev); if (root_port == NULL) { device_printf(dev, "Unable to find parent root port\n"); return; } v = pcie_adjust_config(root_port, PCIER_DEVICE_CTL, PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE, 0, 2); if ((v & (PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE)) != 0) device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n", device_get_nameunit(root_port)); } static const struct devnames devnames[] = { { .nexus_name = "t4nex", .ifnet_name = "cxgbe", .vi_ifnet_name = "vcxgbe", .pf03_drv_name = "t4iov", .vf_nexus_name = "t4vf", .vf_ifnet_name = "cxgbev" }, { .nexus_name = "t5nex", .ifnet_name = "cxl", .vi_ifnet_name = "vcxl", .pf03_drv_name = "t5iov", .vf_nexus_name = "t5vf", .vf_ifnet_name = "cxlv" }, { .nexus_name = "t6nex", .ifnet_name = "cc", .vi_ifnet_name = "vcc", .pf03_drv_name = "t6iov", .vf_nexus_name = "t6vf", .vf_ifnet_name = "ccv" } }; void t4_init_devnames(struct adapter *sc) { int id; id = chip_id(sc); if (id >= CHELSIO_T4 && id - CHELSIO_T4 < nitems(devnames)) sc->names = &devnames[id - CHELSIO_T4]; else { device_printf(sc->dev, "chip id %d is not supported.\n", id); sc->names = NULL; } } static int t4_attach(device_t dev) { struct adapter *sc; int rc = 0, i, j, n10g, n1g, rqidx, tqidx; struct make_dev_args mda; struct intrs_and_queues iaq; struct sge *s; uint8_t *buf; #ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif #ifdef DEV_NETMAP int nm_rqidx, nm_tqidx; #endif int num_vis; sc = device_get_softc(dev); sc->dev = dev; TUNABLE_INT_FETCH("hw.cxgbe.dflags", &sc->debug_flags); if ((pci_get_device(dev) & 0xff00) == 0x5400) t5_attribute_workaround(dev); pci_enable_busmaster(dev); if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) { uint32_t v; pci_set_max_read_req(dev, 4096); v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2); v |= PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5); } sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS); sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL); sc->traceq = -1; mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF); snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer", device_get_nameunit(dev)); snprintf(sc->lockname, sizeof(sc->lockname), "%s", device_get_nameunit(dev)); mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF); t4_add_adapter(sc); mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF); TAILQ_INIT(&sc->sfl); callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0); mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF); rc = t4_map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); /* Prepare the adapter for operation. */ buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_prep_adapter(sc, buf); free(buf, M_CXGBE); if (rc != 0) { device_printf(dev, "failed to prepare adapter: %d.\n", rc); goto done; } /* * This is the real PF# to which we're attaching. Works from within PCI * passthrough environments too, where pci_get_function() could return a * different PF# depending on the passthrough configuration. We need to * use the real PF# in all our communication with the firmware. */ j = t4_read_reg(sc, A_PL_WHOAMI); sc->pf = chip_id(sc) <= CHELSIO_T5 ? G_SOURCEPF(j) : G_T6_SOURCEPF(j); sc->mbox = sc->pf; t4_init_devnames(sc); if (sc->names == NULL) { rc = ENOTSUP; goto done; /* error message displayed already */ } /* * Do this really early, with the memory windows set up even before the * character device. The userland tool's register i/o and mem read * will work even in "recovery mode". */ setup_memwin(sc); if (t4_init_devlog_params(sc, 0) == 0) fixup_devlog_params(sc); make_dev_args_init(&mda); mda.mda_devsw = &t4_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (rc != 0) device_printf(dev, "failed to create nexus char device: %d.\n", rc); /* Go no further if recovery mode has been requested. */ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { device_printf(dev, "recovery mode.\n"); goto done; } #if defined(__i386__) if ((cpu_feature & CPUID_CX8) == 0) { device_printf(dev, "64 bit atomics not available.\n"); rc = ENOTSUP; goto done; } #endif /* Prepare the firmware for operation */ rc = prep_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_map_bar_2(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_create_dma_tag(sc); if (rc != 0) goto done; /* error message displayed already */ /* * Number of VIs to create per-port. The first VI is the "main" regular * VI for the port. The rest are additional virtual interfaces on the * same physical port. Note that the main VI does not have native * netmap support but the extra VIs do. * * Limit the number of VIs per port to the number of available * MAC addresses per port. */ if (t4_num_vis >= 1) num_vis = t4_num_vis; else num_vis = 1; if (num_vis > nitems(vi_mac_funcs)) { num_vis = nitems(vi_mac_funcs); device_printf(dev, "Number of VIs limited to %d\n", num_vis); } /* * First pass over all the ports - allocate VIs and initialize some * basic parameters like mac address, port type, etc. We also figure * out whether a port is 10G or 1G and use that information when * calculating how many interrupts to attempt to allocate. */ n10g = n1g = 0; for_each_port(sc, i) { struct port_info *pi; pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK); sc->port[i] = pi; /* These must be set before t4_port_init */ pi->adapter = sc; pi->port_id = i; /* * XXX: vi[0] is special so we can't delay this allocation until * pi->nvi's final value is known. */ pi->vi = malloc(sizeof(struct vi_info) * num_vis, M_CXGBE, M_ZERO | M_WAITOK); /* * Allocate the "main" VI and initialize parameters * like mac addr. */ rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i); if (rc != 0) { device_printf(dev, "unable to initialize port %d: %d\n", i, rc); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } pi->link_cfg.requested_fc &= ~(PAUSE_TX | PAUSE_RX); pi->link_cfg.requested_fc |= t4_pause_settings; pi->link_cfg.fc &= ~(PAUSE_TX | PAUSE_RX); pi->link_cfg.fc |= t4_pause_settings; rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, &pi->link_cfg); if (rc != 0) { device_printf(dev, "port %d l1cfg failed: %d\n", i, rc); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF); sc->chan_map[pi->tx_chan] = i; pi->tc = malloc(sizeof(struct tx_sched_class) * sc->chip_params->nsched_cls, M_CXGBE, M_ZERO | M_WAITOK); if (port_top_speed(pi) >= 10) { n10g++; } else { n1g++; } pi->linkdnrc = -1; pi->dev = device_add_child(dev, sc->names->ifnet_name, -1); if (pi->dev == NULL) { device_printf(dev, "failed to add device for port %d.\n", i); rc = ENXIO; goto done; } pi->vi[0].dev = pi->dev; device_set_softc(pi->dev, pi); } /* * Interrupt type, # of interrupts, # of rx/tx queues, etc. */ rc = cfg_itype_and_nqueues(sc, n10g, n1g, num_vis, &iaq); if (rc != 0) goto done; /* error message displayed already */ if (iaq.nrxq_vi + iaq.nofldrxq_vi + iaq.nnmrxq_vi == 0) num_vis = 1; sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; s = &sc->sge; s->nrxq = n10g * iaq.nrxq10g + n1g * iaq.nrxq1g; s->ntxq = n10g * iaq.ntxq10g + n1g * iaq.ntxq1g; if (num_vis > 1) { s->nrxq += (n10g + n1g) * (num_vis - 1) * iaq.nrxq_vi; s->ntxq += (n10g + n1g) * (num_vis - 1) * iaq.ntxq_vi; } s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ #ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; s->nofldtxq = n10g * iaq.nofldtxq10g + n1g * iaq.nofldtxq1g; if (num_vis > 1) { s->nofldrxq += (n10g + n1g) * (num_vis - 1) * iaq.nofldrxq_vi; s->nofldtxq += (n10g + n1g) * (num_vis - 1) * iaq.nofldtxq_vi; } s->neq += s->nofldtxq + s->nofldrxq; s->niq += s->nofldrxq; s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef DEV_NETMAP if (num_vis > 1) { s->nnmrxq = (n10g + n1g) * (num_vis - 1) * iaq.nnmrxq_vi; s->nnmtxq = (n10g + n1g) * (num_vis - 1) * iaq.nnmtxq_vi; } s->neq += s->nnmtxq + s->nnmrxq; s->niq += s->nnmrxq; s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), M_CXGBE, M_ZERO | M_WAITOK); #endif s->ctrlq = malloc(sc->params.nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE, M_ZERO | M_WAITOK); s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE, M_ZERO | M_WAITOK); s->eqmap = malloc(s->neq * sizeof(struct sge_eq *), M_CXGBE, M_ZERO | M_WAITOK); sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE, M_ZERO | M_WAITOK); t4_init_l2t(sc, M_WAITOK); /* * Second pass over the ports. This time we know the number of rx and * tx queues that each port should get. */ rqidx = tqidx = 0; #ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; #endif #ifdef DEV_NETMAP nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; struct vi_info *vi; if (pi == NULL) continue; pi->nvi = num_vis; for_each_vi(pi, j, vi) { vi->pi = pi; vi->qsize_rxq = t4_qsize_rxq; vi->qsize_txq = t4_qsize_txq; vi->first_rxq = rqidx; vi->first_txq = tqidx; if (port_top_speed(pi) >= 10) { vi->tmr_idx = t4_tmr_idx_10g; vi->pktc_idx = t4_pktc_idx_10g; vi->flags |= iaq.intr_flags_10g & INTR_RXQ; vi->nrxq = j == 0 ? iaq.nrxq10g : iaq.nrxq_vi; vi->ntxq = j == 0 ? iaq.ntxq10g : iaq.ntxq_vi; } else { vi->tmr_idx = t4_tmr_idx_1g; vi->pktc_idx = t4_pktc_idx_1g; vi->flags |= iaq.intr_flags_1g & INTR_RXQ; vi->nrxq = j == 0 ? iaq.nrxq1g : iaq.nrxq_vi; vi->ntxq = j == 0 ? iaq.ntxq1g : iaq.ntxq_vi; } rqidx += vi->nrxq; tqidx += vi->ntxq; if (j == 0 && vi->ntxq > 1) vi->rsrv_noflowq = iaq.rsrv_noflowq ? 1 : 0; else vi->rsrv_noflowq = 0; #ifdef TCP_OFFLOAD vi->first_ofld_rxq = ofld_rqidx; vi->first_ofld_txq = ofld_tqidx; if (port_top_speed(pi) >= 10) { vi->flags |= iaq.intr_flags_10g & INTR_OFLD_RXQ; vi->nofldrxq = j == 0 ? iaq.nofldrxq10g : iaq.nofldrxq_vi; vi->nofldtxq = j == 0 ? iaq.nofldtxq10g : iaq.nofldtxq_vi; } else { vi->flags |= iaq.intr_flags_1g & INTR_OFLD_RXQ; vi->nofldrxq = j == 0 ? iaq.nofldrxq1g : iaq.nofldrxq_vi; vi->nofldtxq = j == 0 ? iaq.nofldtxq1g : iaq.nofldtxq_vi; } ofld_rqidx += vi->nofldrxq; ofld_tqidx += vi->nofldtxq; #endif #ifdef DEV_NETMAP if (j > 0) { vi->first_nm_rxq = nm_rqidx; vi->first_nm_txq = nm_tqidx; vi->nnmrxq = iaq.nnmrxq_vi; vi->nnmtxq = iaq.nnmtxq_vi; nm_rqidx += vi->nnmrxq; nm_tqidx += vi->nnmtxq; } #endif } } rc = t4_setup_intr_handlers(sc); if (rc != 0) { device_printf(dev, "failed to setup interrupt handlers: %d\n", rc); goto done; } rc = bus_generic_attach(dev); if (rc != 0) { device_printf(dev, "failed to attach all child ports: %d\n", rc); goto done; } device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", sc->params.pci.speed, sc->params.pci.width, sc->params.nports, sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" : (sc->intr_type == INTR_MSI ? "MSI" : "INTx"), sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq); t4_set_desc(sc); notify_siblings(dev, 0); done: if (rc != 0 && sc->cdev) { /* cdev was created and so cxgbetool works; recover that way. */ device_printf(dev, "error during attach, adapter is now in recovery mode.\n"); rc = 0; } if (rc != 0) t4_detach_common(dev); else t4_sysctls(sc); return (rc); } static int t4_ready(device_t dev) { struct adapter *sc; sc = device_get_softc(dev); if (sc->flags & FW_OK) return (0); return (ENXIO); } static int t4_read_port_device(device_t dev, int port, device_t *child) { struct adapter *sc; struct port_info *pi; sc = device_get_softc(dev); if (port < 0 || port >= MAX_NPORTS) return (EINVAL); pi = sc->port[port]; if (pi == NULL || pi->dev == NULL) return (ENXIO); *child = pi->dev; return (0); } static int notify_siblings(device_t dev, int detaching) { device_t sibling; int error, i; error = 0; for (i = 0; i < PCI_FUNCMAX; i++) { if (i == pci_get_function(dev)) continue; sibling = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), i); if (sibling == NULL || !device_is_attached(sibling)) continue; if (detaching) error = T4_DETACH_CHILD(sibling); else (void)T4_ATTACH_CHILD(sibling); if (error) break; } return (error); } /* * Idempotent */ static int t4_detach(device_t dev) { struct adapter *sc; int rc; sc = device_get_softc(dev); rc = notify_siblings(dev, 1); if (rc) { device_printf(dev, "failed to detach sibling devices: %d\n", rc); return (rc); } return (t4_detach_common(dev)); } int t4_detach_common(device_t dev) { struct adapter *sc; struct port_info *pi; int i, rc; sc = device_get_softc(dev); if (sc->flags & FULL_INIT_DONE) { if (!(sc->flags & IS_VF)) t4_intr_disable(sc); } if (sc->cdev) { destroy_dev(sc->cdev); sc->cdev = NULL; } if (device_is_attached(dev)) { rc = bus_generic_detach(dev); if (rc) { device_printf(dev, "failed to detach child devices: %d\n", rc); return (rc); } } for (i = 0; i < sc->intr_count; i++) t4_free_irq(sc, &sc->irq[i]); for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid); if (pi->dev) device_delete_child(dev, pi->dev); mtx_destroy(&pi->pi_lock); free(pi->vi, M_CXGBE); free(pi->tc, M_CXGBE); free(pi, M_CXGBE); } } if (sc->flags & FULL_INIT_DONE) adapter_full_uninit(sc); if ((sc->flags & (IS_VF | FW_OK)) == FW_OK) t4_fw_bye(sc, sc->mbox); if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX) pci_release_msi(dev); if (sc->regs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); if (sc->udbs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->msix_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid, sc->msix_res); if (sc->l2t) t4_free_l2t(sc->l2t); #ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); #endif #ifdef DEV_NETMAP free(sc->sge.nm_rxq, M_CXGBE); free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); free(sc->sge.txq, M_CXGBE); free(sc->sge.ctrlq, M_CXGBE); free(sc->sge.iqmap, M_CXGBE); free(sc->sge.eqmap, M_CXGBE); free(sc->tids.ftid_tab, M_CXGBE); t4_destroy_dma_tag(sc); if (mtx_initialized(&sc->sc_lock)) { sx_xlock(&t4_list_lock); SLIST_REMOVE(&t4_list, sc, adapter, link); sx_xunlock(&t4_list_lock); mtx_destroy(&sc->sc_lock); } callout_drain(&sc->sfl_callout); if (mtx_initialized(&sc->tids.ftid_lock)) mtx_destroy(&sc->tids.ftid_lock); if (mtx_initialized(&sc->sfl_lock)) mtx_destroy(&sc->sfl_lock); if (mtx_initialized(&sc->ifp_lock)) mtx_destroy(&sc->ifp_lock); if (mtx_initialized(&sc->reg_lock)) mtx_destroy(&sc->reg_lock); for (i = 0; i < NUM_MEMWIN; i++) { struct memwin *mw = &sc->memwin[i]; if (rw_initialized(&mw->mw_lock)) rw_destroy(&mw->mw_lock); } bzero(sc, sizeof(*sc)); return (0); } static int cxgbe_probe(device_t dev) { char buf[128]; struct port_info *pi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d", pi->port_id); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS) #define T4_CAP_ENABLE (T4_CAP) static int cxgbe_vi_attach(device_t dev, struct vi_info *vi) { struct ifnet *ifp; struct sbuf *sb; vi->xact_addr_filt = -1; callout_init(&vi->tick, 1); /* Allocate an ifnet and set it up */ ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } vi->ifp = ifp; ifp->if_softc = vi; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = cxgbe_init; ifp->if_ioctl = cxgbe_ioctl; ifp->if_transmit = cxgbe_transmit; ifp->if_qflush = cxgbe_qflush; ifp->if_get_counter = cxgbe_get_counter; ifp->if_capabilities = T4_CAP; #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) ifp->if_capabilities |= IFCAP_TOE; #endif ifp->if_capenable = T4_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS; ifp->if_hw_tsomaxsegsize = 65536; /* Initialize ifmedia for this VI */ ifmedia_init(&vi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); build_medialist(vi->pi, &vi->media); vi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, EVENTHANDLER_PRI_ANY); ether_ifattach(ifp, vi->hw_addr); #ifdef DEV_NETMAP if (vi->nnmrxq != 0) cxgbe_nm_attach(vi); #endif sb = sbuf_new_auto(); sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq); #ifdef TCP_OFFLOAD if (ifp->if_capabilities & IFCAP_TOE) sbuf_printf(sb, "; %d txq, %d rxq (TOE)", vi->nofldtxq, vi->nofldrxq); #endif #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) sbuf_printf(sb, "; %d txq, %d rxq (netmap)", vi->nnmtxq, vi->nnmrxq); #endif sbuf_finish(sb); device_printf(dev, "%s\n", sbuf_data(sb)); sbuf_delete(sb); vi_sysctls(vi); return (0); } static int cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; struct vi_info *vi; int i, rc; callout_init_mtx(&pi->tick, &pi->pi_lock, 0); rc = cxgbe_vi_attach(dev, &pi->vi[0]); if (rc) return (rc); for_each_vi(pi, i, vi) { if (i == 0) continue; vi->dev = device_add_child(dev, sc->names->vi_ifnet_name, -1); if (vi->dev == NULL) { device_printf(dev, "failed to add VI %d\n", i); continue; } device_set_softc(vi->dev, vi); } cxgbe_sysctls(pi); bus_generic_attach(dev); return (0); } static void cxgbe_vi_detach(struct vi_info *vi) { struct ifnet *ifp = vi->ifp; ether_ifdetach(ifp); if (vi->vlan_c) EVENTHANDLER_DEREGISTER(vlan_config, vi->vlan_c); /* Let detach proceed even if these fail. */ #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) cxgbe_nm_detach(vi); #endif cxgbe_uninit_synchronized(vi); callout_drain(&vi->tick); vi_full_uninit(vi); ifmedia_removeall(&vi->media); if_free(vi->ifp); vi->ifp = NULL; } static int cxgbe_detach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; int rc; /* Detach the extra VIs first. */ rc = bus_generic_detach(dev); if (rc) return (rc); device_delete_children(dev); doom_vi(sc, &pi->vi[0]); if (pi->flags & HAS_TRACEQ) { sc->traceq = -1; /* cloner should not create ifnet */ t4_tracer_port_detach(sc); } cxgbe_vi_detach(&pi->vi[0]); callout_drain(&pi->tick); end_synchronized_op(sc, 0); return (0); } static void cxgbe_init(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->pi->adapter; if (begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init") != 0) return; cxgbe_init_synchronized(vi); end_synchronized_op(sc, 0); } static int cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) { int rc = 0, mtu, flags, can_sleep; struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct ifreq *ifr = (struct ifreq *)data; uint32_t mask; switch (cmd) { case SIOCSIFMTU: mtu = ifr->ifr_mtu; if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) return (EINVAL); rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu"); if (rc) return (rc); ifp->if_mtu = mtu; if (vi->flags & VI_INIT_DONE) { t4_update_fl_bufsize(ifp); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: can_sleep = 0; redo_sifflags: rc = begin_synchronized_op(sc, vi, can_sleep ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4flg"); if (rc) return (rc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { flags = vi->if_flags; if ((ifp->if_flags ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { if (can_sleep == 1) { end_synchronized_op(sc, 0); can_sleep = 0; goto redo_sifflags; } rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } } else { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_init_synchronized(vi); } vi->if_flags = ifp->if_flags; } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_uninit_synchronized(vi); } end_synchronized_op(sc, can_sleep ? 0 : LOCK_HELD); break; case SIOCADDMULTI: case SIOCDELMULTI: /* these two are called with a mutex held :-( */ rc = begin_synchronized_op(sc, vi, HOLD_LOCK, "t4multi"); if (rc) return (rc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, LOCK_HELD); break; case SIOCSIFCAP: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap"); if (rc) return (rc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO4; if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO6; if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; /* * Note that we leave CSUM_TSO alone (it is always set). The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { if_printf(ifp, "enable txcsum first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO4; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { if_printf(ifp, "enable txcsum6 first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO6; } if (mask & IFCAP_LRO) { #if defined(INET) || defined(INET6) int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_LRO; for_each_rxq(vi, i, rxq) { if (ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; else rxq->iq.flags &= ~IQ_LRO_ENABLED; } #endif } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; rc = toe_capability(vi, enable); if (rc != 0) goto fail; ifp->if_capenable ^= mask; } #endif if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; /* Need to find out how to disable auto-mtu-inflation */ } if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWCSUM) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif fail: end_synchronized_op(sc, 0); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: ifmedia_ioctl(ifp, ifr, &vi->media, cmd); break; case SIOCGI2C: { struct ifi2creq i2c; rc = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (rc != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { rc = EPERM; break; } if (i2c.len > sizeof(i2c.data)) { rc = EINVAL; break; } rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, vi->pi->port_id, i2c.dev_addr, i2c.offset, i2c.len, &i2c.data[0]); end_synchronized_op(sc, 0); if (rc == 0) rc = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_txq *txq; void *items[1]; int rc; M_ASSERTPKTHDR(m); MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ if (__predict_false(pi->link_cfg.link_ok == 0)) { m_freem(m); return (ENETDOWN); } rc = parse_pkt(sc, &m); if (__predict_false(rc != 0)) { MPASS(m == NULL); /* was freed already */ atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ return (rc); } /* Select a txq. */ txq = &sc->sge.txq[vi->first_txq]; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) + vi->rsrv_noflowq); items[0] = m; rc = mp_ring_enqueue(txq->r, items, 1, 4096); if (__predict_false(rc != 0)) m_freem(m); return (rc); } static void cxgbe_qflush(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct sge_txq *txq; int i; /* queues do not exist if !VI_INIT_DONE. */ if (vi->flags & VI_INIT_DONE) { for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 0); pause("qflush", 1); } } } if_qflush(ifp); } static uint64_t vi_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct fw_vi_stats_vf *s = &vi->stats; vi_refresh_stats(vi->pi->adapter, vi); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_bcast_frames + s->rx_mcast_frames + s->rx_ucast_frames); case IFCOUNTER_IERRORS: return (s->rx_err_frames); case IFCOUNTER_OPACKETS: return (s->tx_bcast_frames + s->tx_mcast_frames + s->tx_ucast_frames + s->tx_offload_frames); case IFCOUNTER_OERRORS: return (s->tx_drop_frames); case IFCOUNTER_IBYTES: return (s->rx_bcast_bytes + s->rx_mcast_bytes + s->rx_ucast_bytes); case IFCOUNTER_OBYTES: return (s->tx_bcast_bytes + s->tx_mcast_bytes + s->tx_ucast_bytes + s->tx_offload_bytes); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = 0; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->drops); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } uint64_t cxgbe_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct port_stats *s = &pi->stats; if (pi->nvi > 1 || sc->flags & IS_VF) return (vi_get_counter(ifp, c)); cxgbe_refresh_stats(sc, pi); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_frames); case IFCOUNTER_IERRORS: return (s->rx_jabber + s->rx_runt + s->rx_too_long + s->rx_fcs_err + s->rx_len_err); case IFCOUNTER_OPACKETS: return (s->tx_frames); case IFCOUNTER_OERRORS: return (s->tx_error_frames); case IFCOUNTER_IBYTES: return (s->rx_octets); case IFCOUNTER_OBYTES: return (s->tx_octets); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_IQDROPS: return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3 + pi->tnl_cong_drops); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = s->tx_drop; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->drops); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } static int cxgbe_media_change(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; device_printf(vi->dev, "%s unimplemented.\n", __func__); return (EOPNOTSUPP); } static void cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct ifmedia_entry *cur; int speed = pi->link_cfg.speed; cur = vi->media.ifm_cur; ifmr->ifm_status = IFM_AVALID; if (!pi->link_cfg.link_ok) return; ifmr->ifm_status |= IFM_ACTIVE; /* active and current will differ iff current media is autoselect. */ if (IFM_SUBTYPE(cur->ifm_media) != IFM_AUTO) return; ifmr->ifm_active = IFM_ETHER | IFM_FDX; if (speed == 10000) ifmr->ifm_active |= IFM_10G_T; else if (speed == 1000) ifmr->ifm_active |= IFM_1000_T; else if (speed == 100) ifmr->ifm_active |= IFM_100_TX; else if (speed == 10) ifmr->ifm_active |= IFM_10_T; else KASSERT(0, ("%s: link up but speed unknown (%u)", __func__, speed)); } static int vcxgbe_probe(device_t dev) { char buf[128]; struct vi_info *vi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d vi %td", vi->pi->port_id, vi - vi->pi->vi); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } static int vcxgbe_attach(device_t dev) { struct vi_info *vi; struct port_info *pi; struct adapter *sc; int func, index, rc; u32 param, val; vi = device_get_softc(dev); pi = vi->pi; sc = pi->adapter; index = vi - pi->vi; KASSERT(index < nitems(vi_mac_funcs), ("%s: VI %s doesn't have a MAC func", __func__, device_get_nameunit(dev))); func = vi_mac_funcs[index]; rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1, vi->hw_addr, &vi->rss_size, func, 0); if (rc < 0) { device_printf(dev, "Failed to allocate virtual interface " "for port %d: %d\n", pi->port_id, -rc); return (-rc); } vi->viid = rc; if (chip_id(sc) <= CHELSIO_T5) vi->smt_idx = (rc & 0x7f) << 1; else vi->smt_idx = (rc & 0x7f); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) | V_FW_PARAMS_PARAM_YZ(vi->viid); rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc) vi->rss_base = 0xffff; else { /* MPASS((val >> 16) == rss_size); */ vi->rss_base = val & 0xffff; } rc = cxgbe_vi_attach(dev, vi); if (rc) { t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); return (rc); } return (0); } static int vcxgbe_detach(device_t dev) { struct vi_info *vi; struct adapter *sc; vi = device_get_softc(dev); sc = vi->pi->adapter; doom_vi(sc, vi); cxgbe_vi_detach(vi); t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); end_synchronized_op(sc, 0); return (0); } void t4_fatal_err(struct adapter *sc) { t4_set_reg_field(sc, A_SGE_CONTROL, F_GLOBALENABLE, 0); t4_intr_disable(sc); log(LOG_EMERG, "%s: encountered fatal error, adapter stopped.\n", device_get_nameunit(sc->dev)); } void t4_add_adapter(struct adapter *sc) { sx_xlock(&t4_list_lock); SLIST_INSERT_HEAD(&t4_list, sc, link); sx_xunlock(&t4_list_lock); } int t4_map_bars_0_and_4(struct adapter *sc) { sc->regs_rid = PCIR_BAR(0); sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE); if (sc->regs_res == NULL) { device_printf(sc->dev, "cannot map registers.\n"); return (ENXIO); } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); setbit(&sc->doorbells, DOORBELL_KDB); sc->msix_rid = PCIR_BAR(4); sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->msix_rid, RF_ACTIVE); if (sc->msix_res == NULL) { device_printf(sc->dev, "cannot map MSI-X BAR.\n"); return (ENXIO); } return (0); } int t4_map_bar_2(struct adapter *sc) { /* * T4: only iWARP driver uses the userspace doorbells. There is no need * to map it if RDMA is disabled. */ if (is_t4(sc) && sc->rdmacaps == 0) return (0); sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE); if (sc->udbs_res == NULL) { device_printf(sc->dev, "cannot map doorbell BAR.\n"); return (ENXIO); } sc->udbs_base = rman_get_virtual(sc->udbs_res); if (chip_id(sc) >= CHELSIO_T5) { setbit(&sc->doorbells, DOORBELL_UDB); #if defined(__i386__) || defined(__amd64__) if (t5_write_combine) { int rc, mode; /* * Enable write combining on BAR2. This is the * userspace doorbell BAR and is split into 128B * (UDBS_SEG_SIZE) doorbell regions, each associated * with an egress queue. The first 64B has the doorbell * and the second 64B can be used to submit a tx work * request with an implicit doorbell. */ rc = pmap_change_attr((vm_offset_t)sc->udbs_base, rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING); if (rc == 0) { clrbit(&sc->doorbells, DOORBELL_UDB); setbit(&sc->doorbells, DOORBELL_WCWR); setbit(&sc->doorbells, DOORBELL_UDBWC); } else { device_printf(sc->dev, "couldn't enable write combining: %d\n", rc); } mode = is_t5(sc) ? V_STATMODE(0) : V_T6_STATMODE(0); t4_write_reg(sc, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) | mode); } #endif } return (0); } struct memwin_init { uint32_t base; uint32_t aperture; }; static const struct memwin_init t4_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 } }; static const struct memwin_init t5_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 }, }; static void setup_memwin(struct adapter *sc) { const struct memwin_init *mw_init; struct memwin *mw; int i; uint32_t bar0; if (is_t4(sc)) { /* * Read low 32b of bar0 indirectly via the hardware backdoor * mechanism. Works from within PCI passthrough environments * too, where rman_get_start() can return a different value. We * need to program the T4 memory window decoders with the actual * addresses that will be coming across the PCIe link. */ bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0)); bar0 &= (uint32_t) PCIM_BAR_MEM_BASE; mw_init = &t4_memwin[0]; } else { /* T5+ use the relative offset inside the PCIe BAR */ bar0 = 0; mw_init = &t5_memwin[0]; } for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) { rw_init(&mw->mw_lock, "memory window access"); mw->mw_base = mw_init->base; mw->mw_aperture = mw_init->aperture; mw->mw_curpos = 0; t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i), (mw->mw_base + bar0) | V_BIR(0) | V_WINDOW(ilog2(mw->mw_aperture) - 10)); rw_wlock(&mw->mw_lock); position_memwin(sc, i, 0); rw_wunlock(&mw->mw_lock); } /* flush */ t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2)); } /* * Positions the memory window at the given address in the card's address space. * There are some alignment requirements and the actual position may be at an * address prior to the requested address. mw->mw_curpos always has the actual * position of the window. */ static void position_memwin(struct adapter *sc, int idx, uint32_t addr) { struct memwin *mw; uint32_t pf; uint32_t reg; MPASS(idx >= 0 && idx < NUM_MEMWIN); mw = &sc->memwin[idx]; rw_assert(&mw->mw_lock, RA_WLOCKED); if (is_t4(sc)) { pf = 0; mw->mw_curpos = addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); mw->mw_curpos = addr & ~0x7f; /* start must be 128B aligned */ } reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx); t4_write_reg(sc, reg, mw->mw_curpos | pf); t4_read_reg(sc, reg); /* flush */ } static int rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len, int rw) { struct memwin *mw; uint32_t mw_end, v; MPASS(idx >= 0 && idx < NUM_MEMWIN); /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len <= 0) return (EINVAL); mw = &sc->memwin[idx]; while (len > 0) { rw_rlock(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; if (addr >= mw_end || addr < mw->mw_curpos) { /* Will need to reposition the window */ if (!rw_try_upgrade(&mw->mw_lock)) { rw_runlock(&mw->mw_lock); rw_wlock(&mw->mw_lock); } rw_assert(&mw->mw_lock, RA_WLOCKED); position_memwin(sc, idx, addr); rw_downgrade(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; } rw_assert(&mw->mw_lock, RA_RLOCKED); while (addr < mw_end && len > 0) { if (rw == 0) { v = t4_read_reg(sc, mw->mw_base + addr - mw->mw_curpos); *val++ = le32toh(v); } else { v = *val++; t4_write_reg(sc, mw->mw_base + addr - mw->mw_curpos, htole32(v)); } addr += 4; len -= 4; } rw_runlock(&mw->mw_lock); } return (0); } static inline int read_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, val, len, 0)); } static inline int write_via_memwin(struct adapter *sc, int idx, uint32_t addr, const uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, (void *)(uintptr_t)val, len, 1)); } static int t4_range_cmp(const void *a, const void *b) { return ((const struct t4_range *)a)->start - ((const struct t4_range *)b)->start; } /* * Verify that the memory range specified by the addr/len pair is valid within * the card's address space. */ static int validate_mem_range(struct adapter *sc, uint32_t addr, int len) { struct t4_range mem_ranges[4], *r, *next; uint32_t em, addr_len; int i, n, remaining; /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len <= 0) return (EINVAL); /* Enabled memories */ em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); r = &mem_ranges[0]; n = 0; bzero(r, sizeof(mem_ranges)); if (em & F_EDRAM0_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); r->size = G_EDRAM0_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM0_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EDRAM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); r->size = G_EDRAM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EXT_MEM_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); r->size = G_EXT_MEM_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); r->size = G_EXT_MEM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } MPASS(n <= nitems(mem_ranges)); if (n > 1) { /* Sort and merge the ranges. */ qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp); /* Start from index 0 and examine the next n - 1 entries. */ r = &mem_ranges[0]; for (remaining = n - 1; remaining > 0; remaining--, r++) { MPASS(r->size > 0); /* r is a valid entry. */ next = r + 1; MPASS(next->size > 0); /* and so is the next one. */ while (r->start + r->size >= next->start) { /* Merge the next one into the current entry. */ r->size = max(r->start + r->size, next->start + next->size) - r->start; n--; /* One fewer entry in total. */ if (--remaining == 0) goto done; /* short circuit */ next++; } if (next != r + 1) { /* * Some entries were merged into r and next * points to the first valid entry that couldn't * be merged. */ MPASS(next->size > 0); /* must be valid */ memcpy(r + 1, next, remaining * sizeof(*r)); #ifdef INVARIANTS /* * This so that the foo->size assertion in the * next iteration of the loop do the right * thing for entries that were pulled up and are * no longer valid. */ MPASS(n < nitems(mem_ranges)); bzero(&mem_ranges[n], (nitems(mem_ranges) - n) * sizeof(struct t4_range)); #endif } } done: /* Done merging the ranges. */ MPASS(n > 0); r = &mem_ranges[0]; for (i = 0; i < n; i++, r++) { if (addr >= r->start && addr + len <= r->start + r->size) return (0); } } return (EFAULT); } static int fwmtype_to_hwmtype(int mtype) { switch (mtype) { case FW_MEMTYPE_EDC0: return (MEM_EDC0); case FW_MEMTYPE_EDC1: return (MEM_EDC1); case FW_MEMTYPE_EXTMEM: return (MEM_MC0); case FW_MEMTYPE_EXTMEM1: return (MEM_MC1); default: panic("%s: cannot translate fw mtype %d.", __func__, mtype); } } /* * Verify that the memory range specified by the memtype/offset/len pair is * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr. */ static int validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len, uint32_t *addr) { uint32_t em, addr_len, maddr; /* Memory can only be accessed in naturally aligned 4 byte units */ if (off & 3 || len & 3 || len == 0) return (EINVAL); em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); switch (fwmtype_to_hwmtype(mtype)) { case MEM_EDC0: if (!(em & F_EDRAM0_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; break; case MEM_EDC1: if (!(em & F_EDRAM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; break; case MEM_MC: if (!(em & F_EXT_MEM_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; break; case MEM_MC1: if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; break; default: return (EINVAL); } *addr = maddr + off; /* global address */ return (validate_mem_range(sc, *addr, len)); } static int fixup_devlog_params(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; int rc; rc = validate_mt_off_len(sc, dparams->memtype, dparams->start, dparams->size, &dparams->addr); return (rc); } static int cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, int num_vis, struct intrs_and_queues *iaq) { int rc, itype, navail, nrxq10g, nrxq1g, n; int nofldrxq10g = 0, nofldrxq1g = 0; bzero(iaq, sizeof(*iaq)); iaq->ntxq10g = t4_ntxq10g; iaq->ntxq1g = t4_ntxq1g; iaq->ntxq_vi = t4_ntxq_vi; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; iaq->nrxq_vi = t4_nrxq_vi; iaq->rsrv_noflowq = t4_rsrv_noflowq; #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldtxq10g = t4_nofldtxq10g; iaq->nofldtxq1g = t4_nofldtxq1g; iaq->nofldtxq_vi = t4_nofldtxq_vi; iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g; iaq->nofldrxq1g = nofldrxq1g = t4_nofldrxq1g; iaq->nofldrxq_vi = t4_nofldrxq_vi; } #endif #ifdef DEV_NETMAP iaq->nnmtxq_vi = t4_nnmtxq_vi; iaq->nnmrxq_vi = t4_nnmrxq_vi; #endif for (itype = INTR_MSIX; itype; itype >>= 1) { if ((itype & t4_intr_types) == 0) continue; /* not allowed */ if (itype == INTR_MSIX) navail = pci_msix_count(sc->dev); else if (itype == INTR_MSI) navail = pci_msi_count(sc->dev); else navail = 1; restart: if (navail == 0) continue; iaq->intr_type = itype; iaq->intr_flags_10g = 0; iaq->intr_flags_1g = 0; /* * Best option: an interrupt vector for errors, one for the * firmware event queue, and one for every rxq (NIC and TOE) of * every VI. The VIs that support netmap use the same * interrupts for the NIC rx queues and the netmap rx queues * because only one set of queues is active at a time. */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g * (nrxq10g + nofldrxq10g); iaq->nirq += n1g * (nrxq1g + nofldrxq1g); iaq->nirq += (n10g + n1g) * (num_vis - 1) * max(iaq->nrxq_vi, iaq->nnmrxq_vi); /* See comment above. */ iaq->nirq += (n10g + n1g) * (num_vis - 1) * iaq->nofldrxq_vi; if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { iaq->intr_flags_10g = INTR_ALL; iaq->intr_flags_1g = INTR_ALL; goto allocate; } /* Disable the VIs (and netmap) if there aren't enough intrs */ if (num_vis > 1) { device_printf(sc->dev, "virtual interfaces disabled " "because num_vis=%u with current settings " "(nrxq10g=%u, nrxq1g=%u, nofldrxq10g=%u, " "nofldrxq1g=%u, nrxq_vi=%u nofldrxq_vi=%u, " "nnmrxq_vi=%u) would need %u interrupts but " "only %u are available.\n", num_vis, nrxq10g, nrxq1g, nofldrxq10g, nofldrxq1g, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, iaq->nirq, navail); num_vis = 1; iaq->ntxq_vi = iaq->nrxq_vi = 0; iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0; iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0; goto restart; } /* * Second best option: a vector for errors, one for the firmware * event queue, and vectors for either all the NIC rx queues or * all the TOE rx queues. The queues that don't get vectors * will forward their interrupts to those that do. */ iaq->nirq = T4_EXTRA_INTR; if (nrxq10g >= nofldrxq10g) { iaq->intr_flags_10g = INTR_RXQ; iaq->nirq += n10g * nrxq10g; } else { iaq->intr_flags_10g = INTR_OFLD_RXQ; iaq->nirq += n10g * nofldrxq10g; } if (nrxq1g >= nofldrxq1g) { iaq->intr_flags_1g = INTR_RXQ; iaq->nirq += n1g * nrxq1g; } else { iaq->intr_flags_1g = INTR_OFLD_RXQ; iaq->nirq += n1g * nofldrxq1g; } if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) goto allocate; /* * Next best option: an interrupt vector for errors, one for the * firmware event queue, and at least one per main-VI. At this * point we know we'll have to downsize nrxq and/or nofldrxq to * fit what's available to us. */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g + n1g; if (iaq->nirq <= navail) { int leftover = navail - iaq->nirq; if (n10g > 0) { int target = max(nrxq10g, nofldrxq10g); iaq->intr_flags_10g = nrxq10g >= nofldrxq10g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n10g) { leftover -= n10g; iaq->nirq += n10g; n++; } iaq->nrxq10g = min(n, nrxq10g); #ifdef TCP_OFFLOAD iaq->nofldrxq10g = min(n, nofldrxq10g); #endif } if (n1g > 0) { int target = max(nrxq1g, nofldrxq1g); iaq->intr_flags_1g = nrxq1g >= nofldrxq1g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n1g) { leftover -= n1g; iaq->nirq += n1g; n++; } iaq->nrxq1g = min(n, nrxq1g); #ifdef TCP_OFFLOAD iaq->nofldrxq1g = min(n, nofldrxq1g); #endif } if (itype != INTR_MSI || powerof2(iaq->nirq)) goto allocate; } /* * Least desirable option: one interrupt vector for everything. */ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; iaq->intr_flags_10g = iaq->intr_flags_1g = 0; #ifdef TCP_OFFLOAD if (is_offload(sc)) iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif allocate: navail = iaq->nirq; rc = 0; if (itype == INTR_MSIX) rc = pci_alloc_msix(sc->dev, &navail); else if (itype == INTR_MSI) rc = pci_alloc_msi(sc->dev, &navail); if (rc == 0) { if (navail == iaq->nirq) return (0); /* * Didn't get the number requested. Use whatever number * the kernel is willing to allocate (it's in navail). */ device_printf(sc->dev, "fewer vectors than requested, " "type=%d, req=%d, rcvd=%d; will downshift req.\n", itype, iaq->nirq, navail); pci_release_msi(sc->dev); goto restart; } device_printf(sc->dev, "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n", itype, rc, iaq->nirq, navail); } device_printf(sc->dev, "failed to find a usable interrupt type. " "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types, pci_msix_count(sc->dev), pci_msi_count(sc->dev)); return (ENXIO); } #define FW_VERSION(chip) ( \ V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \ V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \ V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \ V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD)) #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf) struct fw_info { uint8_t chip; char *kld_name; char *fw_mod_name; struct fw_hdr fw_hdr; /* XXX: waste of space, need a sparse struct */ } fw_info[] = { { .chip = CHELSIO_T4, .kld_name = "t4fw_cfg", .fw_mod_name = "t4fw", .fw_hdr = { .chip = FW_HDR_CHIP_T4, .fw_ver = htobe32_const(FW_VERSION(T4)), .intfver_nic = FW_INTFVER(T4, NIC), .intfver_vnic = FW_INTFVER(T4, VNIC), .intfver_ofld = FW_INTFVER(T4, OFLD), .intfver_ri = FW_INTFVER(T4, RI), .intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T4, ISCSI), .intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU), .intfver_fcoe = FW_INTFVER(T4, FCOE), }, }, { .chip = CHELSIO_T5, .kld_name = "t5fw_cfg", .fw_mod_name = "t5fw", .fw_hdr = { .chip = FW_HDR_CHIP_T5, .fw_ver = htobe32_const(FW_VERSION(T5)), .intfver_nic = FW_INTFVER(T5, NIC), .intfver_vnic = FW_INTFVER(T5, VNIC), .intfver_ofld = FW_INTFVER(T5, OFLD), .intfver_ri = FW_INTFVER(T5, RI), .intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T5, ISCSI), .intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU), .intfver_fcoe = FW_INTFVER(T5, FCOE), }, }, { .chip = CHELSIO_T6, .kld_name = "t6fw_cfg", .fw_mod_name = "t6fw", .fw_hdr = { .chip = FW_HDR_CHIP_T6, .fw_ver = htobe32_const(FW_VERSION(T6)), .intfver_nic = FW_INTFVER(T6, NIC), .intfver_vnic = FW_INTFVER(T6, VNIC), .intfver_ofld = FW_INTFVER(T6, OFLD), .intfver_ri = FW_INTFVER(T6, RI), .intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T6, ISCSI), .intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU), .intfver_fcoe = FW_INTFVER(T6, FCOE), }, } }; static struct fw_info * find_fw_info(int chip) { int i; for (i = 0; i < nitems(fw_info); i++) { if (fw_info[i].chip == chip) return (&fw_info[i]); } return (NULL); } /* * Is the given firmware API compatible with the one the driver was compiled * with? */ static int fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2) { /* short circuit if it's the exact same firmware version */ if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) return (1); /* * XXX: Is this too conservative? Perhaps I should limit this to the * features that are supported in the driver. */ #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe)) return (1); #undef SAME_INTF return (0); } /* * The firmware in the KLD is usable, but should it be installed? This routine * explains itself in detail if it indicates the KLD firmware should be * installed. */ static int should_install_kld_fw(struct adapter *sc, int card_fw_usable, int k, int c) { const char *reason; if (!card_fw_usable) { reason = "incompatible or unusable"; goto install; } if (k > c) { reason = "older than the version bundled with this driver"; goto install; } if (t4_fw_install == 2 && k != c) { reason = "different than the version bundled with this driver"; goto install; } return (0); install: if (t4_fw_install == 0) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver is prohibited from installing a different " "firmware on the card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); return (0); } device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "installing firmware %u.%u.%u.%u on card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason, G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); return (1); } /* * Establish contact with the firmware and determine if we are the master driver * or not, and whether we are responsible for chip initialization. */ static int prep_firmware(struct adapter *sc) { const struct firmware *fw = NULL, *default_cfg; int rc, pf, card_fw_usable, kld_fw_usable, need_fw_reset = 1; enum dev_state state; struct fw_info *fw_info; struct fw_hdr *card_fw; /* fw on the card */ const struct fw_hdr *kld_fw; /* fw in the KLD */ const struct fw_hdr *drv_fw; /* fw header the driver was compiled against */ /* Contact firmware. */ rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state); if (rc < 0 || state == DEV_STATE_ERR) { rc = -rc; device_printf(sc->dev, "failed to connect to the firmware: %d, %d.\n", rc, state); return (rc); } pf = rc; if (pf == sc->mbox) sc->flags |= MASTER_PF; else if (state == DEV_STATE_UNINIT) { /* * We didn't get to be the master so we definitely won't be * configuring the chip. It's a bug if someone else hasn't * configured it already. */ device_printf(sc->dev, "couldn't be master(%d), " "device not already initialized either(%d).\n", rc, state); return (EDOOFUS); } /* This is the firmware whose headers the driver was compiled against */ fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } drv_fw = &fw_info->fw_hdr; /* * The firmware KLD contains many modules. The KLD name is also the * name of the module that contains the default config file. */ default_cfg = firmware_get(fw_info->kld_name); /* Read the header of the firmware on the card */ card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_read_flash(sc, FLASH_FW_START, sizeof (*card_fw) / sizeof (uint32_t), (uint32_t *)card_fw, 1); if (rc == 0) card_fw_usable = fw_compatible(drv_fw, (const void*)card_fw); else { device_printf(sc->dev, "Unable to read card's firmware header: %d\n", rc); card_fw_usable = 0; } /* This is the firmware in the KLD */ fw = firmware_get(fw_info->fw_mod_name); if (fw != NULL) { kld_fw = (const void *)fw->data; kld_fw_usable = fw_compatible(drv_fw, kld_fw); } else { kld_fw = NULL; kld_fw_usable = 0; } if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver && (!kld_fw_usable || kld_fw->fw_ver == drv_fw->fw_ver)) { /* * Common case: the firmware on the card is an exact match and * the KLD is an exact match too, or the KLD is * absent/incompatible. Note that t4_fw_install = 2 is ignored * here -- use cxgbetool loadfw if you want to reinstall the * same firmware as the one on the card. */ } else if (kld_fw_usable && state == DEV_STATE_UNINIT && should_install_kld_fw(sc, card_fw_usable, be32toh(kld_fw->fw_ver), be32toh(card_fw->fw_ver))) { rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0); if (rc != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", rc); goto done; } /* Installed successfully, update the cached header too. */ memcpy(card_fw, kld_fw, sizeof(*card_fw)); card_fw_usable = 1; need_fw_reset = 0; /* already reset as part of load_fw */ } if (!card_fw_usable) { uint32_t d, c, k; d = ntohl(drv_fw->fw_ver); c = ntohl(card_fw->fw_ver); k = kld_fw ? ntohl(kld_fw->fw_ver) : 0; device_printf(sc->dev, "Cannot find a usable firmware: " "fw_install %d, chip state %d, " "driver compiled with %d.%d.%d.%d, " "card has %d.%d.%d.%d, KLD has %d.%d.%d.%d\n", t4_fw_install, state, G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d), G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); rc = EINVAL; goto done; } /* Reset device */ if (need_fw_reset && (rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST)) != 0) { device_printf(sc->dev, "firmware reset failed: %d.\n", rc); if (rc != ETIMEDOUT && rc != EIO) t4_fw_bye(sc, sc->mbox); goto done; } sc->flags |= FW_OK; rc = get_params__pre_init(sc); if (rc != 0) goto done; /* error message displayed already */ /* Partition adapter resources as specified in the config file. */ if (state == DEV_STATE_UNINIT) { KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); rc = partition_resources(sc, default_cfg, fw_info->kld_name); if (rc != 0) goto done; /* error message displayed already */ t4_tweak_chip_settings(sc); /* get basic stuff going */ rc = -t4_fw_initialize(sc, sc->mbox); if (rc != 0) { device_printf(sc->dev, "fw init failed: %d.\n", rc); goto done; } } else { snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", pf); sc->cfcsum = 0; } done: free(card_fw, M_CXGBE); if (fw != NULL) firmware_put(fw, FIRMWARE_UNLOAD); if (default_cfg != NULL) firmware_put(default_cfg, FIRMWARE_UNLOAD); return (rc); } #define FW_PARAM_DEV(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) #define FW_PARAM_PFVF(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)) /* * Partition chip resources for use between various PFs, VFs, etc. */ static int partition_resources(struct adapter *sc, const struct firmware *default_cfg, const char *name_prefix) { const struct firmware *cfg = NULL; int rc = 0; struct fw_caps_config_cmd caps; uint32_t mtype, moff, finicsum, cfcsum; /* * Figure out what configuration file to use. Pick the default config * file for the card if the user hasn't specified one explicitly. */ snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", t4_cfg_file); if (strncmp(t4_cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { /* Card specific overrides go here. */ if (pci_get_device(sc->dev) == 0x440a) snprintf(sc->cfg_file, sizeof(sc->cfg_file), UWIRE_CF); if (is_fpga(sc)) snprintf(sc->cfg_file, sizeof(sc->cfg_file), FPGA_CF); } /* * We need to load another module if the profile is anything except * "default" or "flash". */ if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) != 0 && strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { char s[32]; snprintf(s, sizeof(s), "%s_%s", name_prefix, sc->cfg_file); cfg = firmware_get(s); if (cfg == NULL) { if (default_cfg != NULL) { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the default config file instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", DEFAULT_CF); } else { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the config file on the card's flash " "instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } } } if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) == 0 && default_cfg == NULL) { device_printf(sc->dev, "default config file not available, will use the config " "file on the card's flash instead.\n"); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } if (strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { u_int cflen; const uint32_t *cfdata; uint32_t param, val, addr; KASSERT(cfg != NULL || default_cfg != NULL, ("%s: no config to upload", __func__)); /* * Ask the firmware where it wants us to upload the config file. */ param = FW_PARAM_DEV(CF); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* No support for config file? Shouldn't happen. */ device_printf(sc->dev, "failed to query config file location: %d.\n", rc); goto done; } mtype = G_FW_PARAMS_PARAM_Y(val); moff = G_FW_PARAMS_PARAM_Z(val) << 16; /* * XXX: sheer laziness. We deliberately added 4 bytes of * useless stuffing/comments at the end of the config file so * it's ok to simply throw away the last remaining bytes when * the config file is not an exact multiple of 4. This also * helps with the validate_mt_off_len check. */ if (cfg != NULL) { cflen = cfg->datasize & ~3; cfdata = cfg->data; } else { cflen = default_cfg->datasize & ~3; cfdata = default_cfg->data; } if (cflen > FLASH_CFG_MAX_SIZE) { device_printf(sc->dev, "config file too long (%d, max allowed is %d). " "Will try to use the config on the card, if any.\n", cflen, FLASH_CFG_MAX_SIZE); goto use_config_on_flash; } rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr); if (rc != 0) { device_printf(sc->dev, "%s: addr (%d/0x%x) or len %d is not valid: %d. " "Will try to use the config on the card, if any.\n", __func__, mtype, moff, cflen, rc); goto use_config_on_flash; } write_via_memwin(sc, 2, addr, cfdata, cflen); } else { use_config_on_flash: mtype = FW_MEMTYPE_FLASH; moff = t4_flash_cfg_addr(sc); } bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to pre-process config file: %d " "(mtype %d, moff 0x%x).\n", rc, mtype, moff); goto done; } finicsum = be32toh(caps.finicsum); cfcsum = be32toh(caps.cfcsum); if (finicsum != cfcsum) { device_printf(sc->dev, "WARNING: config file checksum mismatch: %08x %08x\n", finicsum, cfcsum); } sc->cfcsum = cfcsum; #define LIMIT_CAPS(x) do { \ caps.x &= htobe16(t4_##x##_allowed); \ } while (0) /* * Let the firmware know what features will (not) be used so it can tune * things accordingly. */ LIMIT_CAPS(nbmcaps); LIMIT_CAPS(linkcaps); LIMIT_CAPS(switchcaps); LIMIT_CAPS(niccaps); LIMIT_CAPS(toecaps); LIMIT_CAPS(rdmacaps); LIMIT_CAPS(cryptocaps); LIMIT_CAPS(iscsicaps); LIMIT_CAPS(fcoecaps); #undef LIMIT_CAPS caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL); if (rc != 0) { device_printf(sc->dev, "failed to process config file: %d.\n", rc); } done: if (cfg != NULL) firmware_put(cfg, FIRMWARE_UNLOAD); return (rc); } /* * Retrieve parameters that are needed (or nice to have) very early. */ static int get_params__pre_init(struct adapter *sc) { int rc; uint32_t param[2], val[2]; t4_get_version_info(sc); snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); snprintf(sc->bs_version, sizeof(sc->bs_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MINOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MICRO(sc->params.bs_vers), G_FW_HDR_FW_VER_BUILD(sc->params.bs_vers)); snprintf(sc->tp_version, sizeof(sc->tp_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers), G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers)); snprintf(sc->er_version, sizeof(sc->er_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.er_vers), G_FW_HDR_FW_VER_MINOR(sc->params.er_vers), G_FW_HDR_FW_VER_MICRO(sc->params.er_vers), G_FW_HDR_FW_VER_BUILD(sc->params.er_vers)); param[0] = FW_PARAM_DEV(PORTVEC); param[1] = FW_PARAM_DEV(CCLK); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (pre_init): %d.\n", rc); return (rc); } sc->params.portvec = val[0]; sc->params.nports = bitcount32(val[0]); sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ rc = -t4_init_devlog_params(sc, 1); if (rc == 0) fixup_devlog_params(sc); else { device_printf(sc->dev, "failed to get devlog parameters: %d.\n", rc); rc = 0; /* devlog isn't critical for device operation */ } return (rc); } /* * Retrieve various parameters that are of interest to the driver. The device * has been initialized by the firmware at this point. */ static int get_params__post_init(struct adapter *sc) { int rc; uint32_t param[7], val[7]; struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); param[2] = FW_PARAM_PFVF(FILTER_START); param[3] = FW_PARAM_PFVF(FILTER_END); param[4] = FW_PARAM_PFVF(L2T_START); param[5] = FW_PARAM_PFVF(L2T_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init): %d.\n", rc); return (rc); } sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; sc->tids.ftid_base = val[2]; sc->tids.nftids = val[3] - val[2] + 1; sc->params.ftid_min = val[2]; sc->params.ftid_max = val[3]; sc->vres.l2t.start = val[4]; sc->vres.l2t.size = val[5] - val[4] + 1; KASSERT(sc->vres.l2t.size <= L2T_SIZE, ("%s: L2 table size (%u) larger than expected (%u)", __func__, sc->vres.l2t.size, L2T_SIZE)); /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to get card capabilities: %d.\n", rc); return (rc); } #define READ_CAPS(x) do { \ sc->x = htobe16(caps.x); \ } while (0) READ_CAPS(nbmcaps); READ_CAPS(linkcaps); READ_CAPS(switchcaps); READ_CAPS(niccaps); READ_CAPS(toecaps); READ_CAPS(rdmacaps); READ_CAPS(cryptocaps); READ_CAPS(iscsicaps); READ_CAPS(fcoecaps); if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) { param[0] = FW_PARAM_PFVF(ETHOFLD_START); param[1] = FW_PARAM_PFVF(ETHOFLD_END); param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query NIC parameters: %d.\n", rc); return (rc); } sc->tids.etid_base = val[0]; sc->params.etid_min = val[0]; sc->tids.netids = val[1] - val[0] + 1; sc->params.netids = sc->tids.netids; sc->params.eo_wr_cred = val[2]; sc->params.ethoffload = 1; } if (sc->toecaps) { /* query offload-related parameters */ param[0] = FW_PARAM_DEV(NTID); param[1] = FW_PARAM_PFVF(SERVER_START); param[2] = FW_PARAM_PFVF(SERVER_END); param[3] = FW_PARAM_PFVF(TDDP_START); param[4] = FW_PARAM_PFVF(TDDP_END); param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TOE parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; sc->vres.ddp.start = val[3]; sc->vres.ddp.size = val[4] - val[3] + 1; sc->params.ofldq_wr_cred = val[5]; sc->params.offload = 1; } if (sc->rdmacaps) { param[0] = FW_PARAM_PFVF(STAG_START); param[1] = FW_PARAM_PFVF(STAG_END); param[2] = FW_PARAM_PFVF(RQ_START); param[3] = FW_PARAM_PFVF(RQ_END); param[4] = FW_PARAM_PFVF(PBL_START); param[5] = FW_PARAM_PFVF(PBL_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(1): %d.\n", rc); return (rc); } sc->vres.stag.start = val[0]; sc->vres.stag.size = val[1] - val[0] + 1; sc->vres.rq.start = val[2]; sc->vres.rq.size = val[3] - val[2] + 1; sc->vres.pbl.start = val[4]; sc->vres.pbl.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SQRQ_START); param[1] = FW_PARAM_PFVF(SQRQ_END); param[2] = FW_PARAM_PFVF(CQ_START); param[3] = FW_PARAM_PFVF(CQ_END); param[4] = FW_PARAM_PFVF(OCQ_START); param[5] = FW_PARAM_PFVF(OCQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(2): %d.\n", rc); return (rc); } sc->vres.qp.start = val[0]; sc->vres.qp.size = val[1] - val[0] + 1; sc->vres.cq.start = val[2]; sc->vres.cq.size = val[3] - val[2] + 1; sc->vres.ocq.start = val[4]; sc->vres.ocq.size = val[5] - val[4] + 1; } if (sc->iscsicaps) { param[0] = FW_PARAM_PFVF(ISCSI_START); param[1] = FW_PARAM_PFVF(ISCSI_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query iSCSI parameters: %d.\n", rc); return (rc); } sc->vres.iscsi.start = val[0]; sc->vres.iscsi.size = val[1] - val[0] + 1; } t4_init_sge_params(sc); /* * We've got the params we wanted to query via the firmware. Now grab * some others directly from the chip. */ rc = t4_read_chip_settings(sc); return (rc); } static int set_params__post_init(struct adapter *sc) { uint32_t param, val; /* ask for encapsulated CPLs */ param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); return (0); } #undef FW_PARAM_PFVF #undef FW_PARAM_DEV static void t4_set_desc(struct adapter *sc) { char buf[128]; struct adapter_params *p = &sc->params; snprintf(buf, sizeof(buf), "Chelsio %s", p->vpd.id); device_set_desc_copy(sc->dev, buf); } static void build_medialist(struct port_info *pi, struct ifmedia *media) { int m; PORT_LOCK(pi); ifmedia_removeall(media); m = IFM_ETHER | IFM_FDX; switch(pi->port_type) { case FW_PORT_TYPE_BT_XFI: case FW_PORT_TYPE_BT_XAUI: ifmedia_add(media, m | IFM_10G_T, 0, NULL); /* fall through */ case FW_PORT_TYPE_BT_SGMII: ifmedia_add(media, m | IFM_1000_T, 0, NULL); ifmedia_add(media, m | IFM_100_TX, 0, NULL); ifmedia_add(media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(media, IFM_ETHER | IFM_AUTO); break; case FW_PORT_TYPE_CX4: ifmedia_add(media, m | IFM_10G_CX4, 0, NULL); ifmedia_set(media, m | IFM_10G_CX4); break; case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_10G_LR, 0, NULL); ifmedia_set(media, m | IFM_10G_LR); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_10G_SR, 0, NULL); ifmedia_set(media, m | IFM_10G_SR); break; case FW_PORT_MOD_TYPE_LRM: ifmedia_add(media, m | IFM_10G_LRM, 0, NULL); ifmedia_set(media, m | IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_10G_TWINAX, 0, NULL); ifmedia_set(media, m | IFM_10G_TWINAX); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; case FW_PORT_MOD_TYPE_NA: case FW_PORT_MOD_TYPE_ER: default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; case FW_PORT_TYPE_CR_QSFP: case FW_PORT_TYPE_SFP28: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_SR: MPASS(pi->port_type == FW_PORT_TYPE_SFP28); ifmedia_add(media, m | IFM_25G_SR, 0, NULL); ifmedia_set(media, m | IFM_25G_SR); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_25G_CR, 0, NULL); ifmedia_set(media, m | IFM_25G_CR); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; case FW_PORT_TYPE_QSFP: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_40G_LR4, 0, NULL); ifmedia_set(media, m | IFM_40G_LR4); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_40G_SR4, 0, NULL); ifmedia_set(media, m | IFM_40G_SR4); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_40G_CR4, 0, NULL); ifmedia_set(media, m | IFM_40G_CR4); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; case FW_PORT_TYPE_CR2_QSFP: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_50G_CR2, 0, NULL); ifmedia_set(media, m | IFM_50G_CR2); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; case FW_PORT_TYPE_KR4_100G: case FW_PORT_TYPE_CR4_QSFP: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_100G_LR4, 0, NULL); ifmedia_set(media, m | IFM_100G_LR4); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_100G_SR4, 0, NULL); ifmedia_set(media, m | IFM_100G_SR4); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_100G_CR4, 0, NULL); ifmedia_set(media, m | IFM_100G_CR4); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } PORT_UNLOCK(pi); } #define FW_MAC_EXACT_CHUNK 7 /* * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). */ int update_mac_settings(struct ifnet *ifp, int flags) { int rc = 0; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); if (flags & XGMAC_MTU) mtu = ifp->if_mtu; if (flags & XGMAC_PROMISC) promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0; if (flags & XGMAC_ALLMULTI) allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0; if (flags & XGMAC_VLANEX) vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc, allmulti, 1, vlanex, false); if (rc) { if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); return (rc); } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt, ucaddr, true, true); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { vi->xact_addr_filt = rc; rc = 0; } } if (flags & XGMAC_MCADDRS) { const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK]; int del = 1; uint64_t hash = 0; struct ifmultiaddr *ifma; int i = 0, j; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mcaddr[i] = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); MPASS(ETHER_IS_MULTICAST(mcaddr[i])); i++; if (i == FW_MAC_EXACT_CHUNK) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } del = 0; i = 0; } } if (i > 0) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } } rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, hash, 0); if (rc != 0) if_printf(ifp, "failed to set mc address hash: %d", rc); mcfail: if_maddr_runlock(ifp); } return (rc); } /* * {begin|end}_synchronized_op must be called from the same thread. */ int begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags, char *wmesg) { int rc, pri; #ifdef WITNESS /* the caller thinks it's ok to sleep, but is it really? */ if (flags & SLEEP_OK) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "begin_synchronized_op"); #endif if (INTR_OK) pri = PCATCH; else pri = 0; ADAPTER_LOCK(sc); for (;;) { if (vi && IS_DOOMED(vi)) { rc = ENXIO; goto done; } if (!IS_BUSY(sc)) { rc = 0; break; } if (!(flags & SLEEP_OK)) { rc = EBUSY; goto done; } if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) { rc = EINTR; goto done; } } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = wmesg; sc->last_op_thr = curthread; sc->last_op_flags = flags; #endif done: if (!(flags & HOLD_LOCK) || rc) ADAPTER_UNLOCK(sc); return (rc); } /* * Tell if_ioctl and if_init that the VI is going away. This is * special variant of begin_synchronized_op and must be paired with a * call to end_synchronized_op. */ void doom_vi(struct adapter *sc, struct vi_info *vi) { ADAPTER_LOCK(sc); SET_DOOMED(vi); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = "t4detach"; sc->last_op_thr = curthread; sc->last_op_flags = 0; #endif ADAPTER_UNLOCK(sc); } /* * {begin|end}_synchronized_op must be called from the same thread. */ void end_synchronized_op(struct adapter *sc, int flags) { if (flags & LOCK_HELD) ADAPTER_LOCK_ASSERT_OWNED(sc); else ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); } static int cxgbe_init_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc = 0, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return (0); /* already running */ if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_full_init(sc)) != 0)) return (rc); /* error message displayed already */ if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_full_init(vi)) != 0)) return (rc); /* error message displayed already */ rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); goto done; } /* * Can't fail from this point onwards. Review cxgbe_uninit_synchronized * if this changes. */ for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_ENABLED; TXQ_UNLOCK(txq); } /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0 && IS_MAIN_VI(vi)) { sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id; t4_write_reg(sc, is_t4(sc) ? A_MPS_TRC_RSS_CONTROL : A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) | V_QUEUENUMBER(sc->traceq)); pi->flags |= HAS_TRACEQ; } /* all ok */ PORT_LOCK(pi); ifp->if_drv_flags |= IFF_DRV_RUNNING; pi->up_vis++; if (pi->nvi > 1 || sc->flags & IS_VF) callout_reset(&vi->tick, hz, vi_tick, vi); else callout_reset(&pi->tick, hz, cxgbe_tick, pi); PORT_UNLOCK(pi); done: if (rc != 0) cxgbe_uninit_synchronized(vi); return (rc); } /* * Idempotent. */ static int cxgbe_uninit_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (!(vi->flags & VI_INIT_DONE)) { KASSERT(!(ifp->if_drv_flags & IFF_DRV_RUNNING), ("uninited VI is running")); return (0); } /* * Disable the VI so that all its data in either direction is discarded * by the MPS. Leave everything else (the queues, interrupts, and 1Hz * tick) intact as the TP can deliver negative advice or data that it's * holding in its RAM (for an offloaded connection) even after the VI is * disabled. */ rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false); if (rc) { if_printf(ifp, "disable_vi failed: %d\n", rc); return (rc); } for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); } PORT_LOCK(pi); if (pi->nvi > 1 || sc->flags & IS_VF) callout_stop(&vi->tick); else callout_stop(&pi->tick); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { PORT_UNLOCK(pi); return (0); } ifp->if_drv_flags &= ~IFF_DRV_RUNNING; pi->up_vis--; if (pi->up_vis > 0) { PORT_UNLOCK(pi); return (0); } PORT_UNLOCK(pi); pi->link_cfg.link_ok = 0; pi->link_cfg.speed = 0; pi->linkdnrc = -1; t4_os_link_changed(sc, pi->port_id, 0, -1); return (0); } /* * It is ok for this function to fail midway and return right away. t4_detach * will walk the entire sc->irq list and clean up whatever is valid. */ int t4_setup_intr_handlers(struct adapter *sc) { int rc, rid, p, q, v; char s[8]; struct irq *irq; struct port_info *pi; struct vi_info *vi; struct sge *sge = &sc->sge; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; #endif #ifdef RSS int nbuckets = rss_getnumbuckets(); #endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 0 : 1; if (sc->intr_count == 1) return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); /* Multiple interrupts. */ if (sc->flags & IS_VF) KASSERT(sc->intr_count >= T4VF_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); else KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); /* The first one is always error intr on PFs */ if (!(sc->flags & IS_VF)) { rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); if (rc != 0) return (rc); irq++; rid++; } /* The second one is always the firmware event queue (first on VFs) */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt"); if (rc != 0) return (rc); irq++; rid++; for_each_port(sc, p) { pi = sc->port[p]; for_each_vi(pi, v, vi) { vi->first_intr = rid - 1; if (vi->nnmrxq > 0) { int n = max(vi->nrxq, vi->nnmrxq); MPASS(vi->flags & INTR_RXQ); rxq = &sge->rxq[vi->first_rxq]; #ifdef DEV_NETMAP nm_rxq = &sge->nm_rxq[vi->first_nm_rxq]; #endif for (q = 0; q < n; q++) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); if (q < vi->nrxq) irq->rxq = rxq++; #ifdef DEV_NETMAP if (q < vi->nnmrxq) irq->nm_rxq = nm_rxq++; #endif rc = t4_alloc_irq(sc, irq, rid, t4_vi_intr, irq, s); if (rc != 0) return (rc); irq++; rid++; vi->nintr++; } } else if (vi->flags & INTR_RXQ) { for_each_rxq(vi, q, rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); if (rc != 0) return (rc); #ifdef RSS bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); #endif irq++; rid++; vi->nintr++; } } #ifdef TCP_OFFLOAD if (vi->flags & INTR_OFLD_RXQ) { for_each_ofld_rxq(vi, q, ofld_rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); if (rc != 0) return (rc); irq++; rid++; vi->nintr++; } } #endif } } MPASS(irq == &sc->irq[sc->intr_count]); return (0); } int adapter_full_init(struct adapter *sc) { int rc, i; ASSERT_SYNCHRONIZED_OP(sc); ADAPTER_LOCK_ASSERT_NOTOWNED(sc); KASSERT((sc->flags & FULL_INIT_DONE) == 0, ("%s: FULL_INIT_DONE already", __func__)); /* * queues that belong to the adapter (not any particular port). */ rc = t4_setup_adapter_queues(sc); if (rc != 0) goto done; for (i = 0; i < nitems(sc->tq); i++) { sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq[i]); if (sc->tq[i] == NULL) { device_printf(sc->dev, "failed to allocate task queue %d\n", i); rc = ENOMEM; goto done; } taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d", device_get_nameunit(sc->dev), i); } if (!(sc->flags & IS_VF)) t4_intr_enable(sc); sc->flags |= FULL_INIT_DONE; done: if (rc != 0) adapter_full_uninit(sc); return (rc); } int adapter_full_uninit(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); t4_teardown_adapter_queues(sc); for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) { taskqueue_free(sc->tq[i]); sc->tq[i] = NULL; } sc->flags &= ~FULL_INIT_DONE; return (0); } #ifdef RSS #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \ RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \ RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \ RSS_HASHTYPE_RSS_UDP_IPV6) /* Translates kernel hash types to hardware. */ static int hashconfig_to_hashen(int hashconfig) { int hashen = 0; if (hashconfig & RSS_HASHTYPE_RSS_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV4) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV6) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; return (hashen); } /* Translates hardware hash types to kernel. */ static int hashen_to_hashconfig(int hashen) { int hashconfig = 0; if (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) { /* * If UDP hashing was enabled it must have been enabled for * either IPv4 or IPv6 (inclusive or). Enabling UDP without * enabling any 4-tuple hash is nonsense configuration. */ MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)); if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV6; } if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV6; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV6; return (hashconfig); } #endif int vi_full_init(struct vi_info *vi) { struct adapter *sc = vi->pi->adapter; struct ifnet *ifp = vi->ifp; uint16_t *rss; struct sge_rxq *rxq; int rc, i, j, hashen; #ifdef RSS int nbuckets = rss_getnumbuckets(); int hashconfig = rss_gethashconfig(); int extra; uint32_t raw_rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; uint32_t rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; #endif ASSERT_SYNCHRONIZED_OP(sc); KASSERT((vi->flags & VI_INIT_DONE) == 0, ("%s: VI_INIT_DONE already", __func__)); sysctl_ctx_init(&vi->ctx); vi->flags |= VI_SYSCTL_CTX; /* * Allocate tx/rx/fl queues for this VI. */ rc = t4_setup_vi_queues(vi); if (rc != 0) goto done; /* error message displayed already */ /* * Setup RSS for this VI. Save a copy of the RSS table for later use. */ if (vi->nrxq > vi->rss_size) { if_printf(ifp, "nrxq (%d) > hw RSS table size (%d); " "some queues will never receive traffic.\n", vi->nrxq, vi->rss_size); } else if (vi->rss_size % vi->nrxq) { if_printf(ifp, "nrxq (%d), hw RSS table size (%d); " "expect uneven traffic distribution.\n", vi->nrxq, vi->rss_size); } #ifdef RSS MPASS(RSS_KEYSIZE == 40); if (vi->nrxq != nbuckets) { if_printf(ifp, "nrxq (%d) != kernel RSS buckets (%d);" "performance will be impacted.\n", vi->nrxq, nbuckets); } rss_getkey((void *)&raw_rss_key[0]); for (i = 0; i < nitems(rss_key); i++) { rss_key[i] = htobe32(raw_rss_key[nitems(rss_key) - 1 - i]); } t4_write_rss_key(sc, &rss_key[0], -1); #endif rss = malloc(vi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK); for (i = 0; i < vi->rss_size;) { #ifdef RSS j = rss_get_indirection_to_bucket(i); j %= vi->nrxq; rxq = &sc->sge.rxq[vi->first_rxq + j]; rss[i++] = rxq->iq.abs_id; #else for_each_rxq(vi, j, rxq) { rss[i++] = rxq->iq.abs_id; if (i == vi->rss_size) break; } #endif } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, rss, vi->rss_size); if (rc != 0) { if_printf(ifp, "rss_config failed: %d\n", rc); goto done; } #ifdef RSS hashen = hashconfig_to_hashen(hashconfig); /* * We may have had to enable some hashes even though the global config * wants them disabled. This is a potential problem that must be * reported to the user. */ extra = hashen_to_hashconfig(hashen) ^ hashconfig; /* * If we consider only the supported hash types, then the enabled hashes * are a superset of the requested hashes. In other words, there cannot * be any supported hash that was requested but not enabled, but there * can be hashes that were not requested but had to be enabled. */ extra &= SUPPORTED_RSS_HASHTYPES; MPASS((extra & hashconfig) == 0); if (extra) { if_printf(ifp, "global RSS config (0x%x) cannot be accommodated.\n", hashconfig); } if (extra & RSS_HASHTYPE_RSS_IPV4) if_printf(ifp, "IPv4 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV4) if_printf(ifp, "TCP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_IPV6) if_printf(ifp, "IPv6 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV6) if_printf(ifp, "TCP/IPv6 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV4) if_printf(ifp, "UDP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV6) if_printf(ifp, "UDP/IPv6 4-tuple hashing forced on.\n"); #else hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN; #endif rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, hashen, rss[0]); if (rc != 0) { if_printf(ifp, "rss hash/defaultq config failed: %d\n", rc); goto done; } vi->rss = rss; vi->flags |= VI_INIT_DONE; done: if (rc != 0) vi_full_uninit(vi); return (rc); } /* * Idempotent. */ int vi_full_uninit(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int i; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif if (vi->flags & VI_INIT_DONE) { /* Need to quiesce queues. */ /* XXX: Only for the first VI? */ if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF)) quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(vi, i, txq) { quiesce_txq(sc, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(vi, i, ofld_txq) { quiesce_wrq(sc, ofld_txq); } #endif for_each_rxq(vi, i, rxq) { quiesce_iq(sc, &rxq->iq); quiesce_fl(sc, &rxq->fl); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); } #endif free(vi->rss, M_CXGBE); free(vi->nm_rss, M_CXGBE); } t4_teardown_vi_queues(vi); vi->flags &= ~VI_INIT_DONE; return (0); } static void quiesce_txq(struct adapter *sc, struct sge_txq *txq) { struct sge_eq *eq = &txq->eq; struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; (void) sc; /* unused */ #ifdef INVARIANTS TXQ_LOCK(txq); MPASS((eq->flags & EQ_ENABLED) == 0); TXQ_UNLOCK(txq); #endif /* Wait for the mp_ring to empty. */ while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 0); pause("rquiesce", 1); } /* Then wait for the hardware to finish. */ while (spg->cidx != htobe16(eq->pidx)) pause("equiesce", 1); /* Finally, wait for the driver to reclaim all descriptors. */ while (eq->cidx != eq->pidx) pause("dquiesce", 1); } static void quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq) { /* XXXTX */ } static void quiesce_iq(struct adapter *sc, struct sge_iq *iq) { (void) sc; /* unused */ /* Synchronize with the interrupt handler */ while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED)) pause("iqfree", 1); } static void quiesce_fl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); fl->flags |= FL_DOOMED; FL_UNLOCK(fl); callout_stop(&sc->sfl_callout); mtx_unlock(&sc->sfl_lock); KASSERT((fl->flags & FL_STARVING) == 0, ("%s: still starving", __func__)); } static int t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid, driver_intr_t *handler, void *arg, char *name) { int rc; irq->rid = rid; irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid, RF_SHAREABLE | RF_ACTIVE); if (irq->res == NULL) { device_printf(sc->dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag); if (rc != 0) { device_printf(sc->dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name, rc); } else if (name) bus_describe_intr(sc->dev, irq->res, irq->tag, "%s", name); return (rc); } static int t4_free_irq(struct adapter *sc, struct irq *irq) { if (irq->tag) bus_teardown_intr(sc->dev, irq->res, irq->tag); if (irq->res) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res); bzero(irq, sizeof(*irq)); return (0); } static void get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf) { regs->version = chip_id(sc) | chip_rev(sc) << 10; t4_get_regs(sc, buf, regs->len); } #define A_PL_INDIR_CMD 0x1f8 #define S_PL_AUTOINC 31 #define M_PL_AUTOINC 0x1U #define V_PL_AUTOINC(x) ((x) << S_PL_AUTOINC) #define G_PL_AUTOINC(x) (((x) >> S_PL_AUTOINC) & M_PL_AUTOINC) #define S_PL_VFID 20 #define M_PL_VFID 0xffU #define V_PL_VFID(x) ((x) << S_PL_VFID) #define G_PL_VFID(x) (((x) >> S_PL_VFID) & M_PL_VFID) #define S_PL_ADDR 0 #define M_PL_ADDR 0xfffffU #define V_PL_ADDR(x) ((x) << S_PL_ADDR) #define G_PL_ADDR(x) (((x) >> S_PL_ADDR) & M_PL_ADDR) #define A_PL_INDIR_DATA 0x1fc static uint64_t read_vf_stat(struct adapter *sc, unsigned int viid, int reg) { u32 stats[2]; mtx_assert(&sc->reg_lock, MA_OWNED); if (sc->flags & IS_VF) { stats[0] = t4_read_reg(sc, VF_MPS_REG(reg)); stats[1] = t4_read_reg(sc, VF_MPS_REG(reg + 4)); } else { t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(G_FW_VIID_VIN(viid)) | V_PL_ADDR(VF_MPS_REG(reg))); stats[0] = t4_read_reg(sc, A_PL_INDIR_DATA); stats[1] = t4_read_reg(sc, A_PL_INDIR_DATA); } return (((uint64_t)stats[1]) << 32 | stats[0]); } static void t4_get_vi_stats(struct adapter *sc, unsigned int viid, struct fw_vi_stats_vf *stats) { #define GET_STAT(name) \ read_vf_stat(sc, viid, A_MPS_VF_STAT_##name##_L) stats->tx_bcast_bytes = GET_STAT(TX_VF_BCAST_BYTES); stats->tx_bcast_frames = GET_STAT(TX_VF_BCAST_FRAMES); stats->tx_mcast_bytes = GET_STAT(TX_VF_MCAST_BYTES); stats->tx_mcast_frames = GET_STAT(TX_VF_MCAST_FRAMES); stats->tx_ucast_bytes = GET_STAT(TX_VF_UCAST_BYTES); stats->tx_ucast_frames = GET_STAT(TX_VF_UCAST_FRAMES); stats->tx_drop_frames = GET_STAT(TX_VF_DROP_FRAMES); stats->tx_offload_bytes = GET_STAT(TX_VF_OFFLOAD_BYTES); stats->tx_offload_frames = GET_STAT(TX_VF_OFFLOAD_FRAMES); stats->rx_bcast_bytes = GET_STAT(RX_VF_BCAST_BYTES); stats->rx_bcast_frames = GET_STAT(RX_VF_BCAST_FRAMES); stats->rx_mcast_bytes = GET_STAT(RX_VF_MCAST_BYTES); stats->rx_mcast_frames = GET_STAT(RX_VF_MCAST_FRAMES); stats->rx_ucast_bytes = GET_STAT(RX_VF_UCAST_BYTES); stats->rx_ucast_frames = GET_STAT(RX_VF_UCAST_FRAMES); stats->rx_err_frames = GET_STAT(RX_VF_ERR_FRAMES); #undef GET_STAT } static void t4_clr_vi_stats(struct adapter *sc, unsigned int viid) { int reg; t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(G_FW_VIID_VIN(viid)) | V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L))); for (reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L; reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H; reg += 4) t4_write_reg(sc, A_PL_INDIR_DATA, 0); } static void vi_refresh_stats(struct adapter *sc, struct vi_info *vi) { struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ if (!(vi->flags & VI_INIT_DONE)) return; getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &vi->last_refreshed, <)) return; mtx_lock(&sc->reg_lock); t4_get_vi_stats(sc, vi->viid, &vi->stats); getmicrotime(&vi->last_refreshed); mtx_unlock(&sc->reg_lock); } static void cxgbe_refresh_stats(struct adapter *sc, struct port_info *pi) { int i; u_int v, tnl_cong_drops; struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &pi->last_refreshed, <)) return; tnl_cong_drops = 0; t4_get_port_stats(sc, pi->tx_chan, &pi->stats); for (i = 0; i < sc->chip_params->nchan; i++) { if (pi->rx_chan_map & (1 << i)) { mtx_lock(&sc->reg_lock); t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); mtx_unlock(&sc->reg_lock); tnl_cong_drops += v; } } pi->tnl_cong_drops = tnl_cong_drops; getmicrotime(&pi->last_refreshed); } static void cxgbe_tick(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; PORT_LOCK_ASSERT_OWNED(pi); cxgbe_refresh_stats(sc, pi); callout_schedule(&pi->tick, hz); } void vi_tick(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->pi->adapter; vi_refresh_stats(sc, vi); callout_schedule(&vi->tick, hz); } static void cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid) { struct ifnet *vlan; if (arg != ifp || ifp->if_type != IFT_ETHER) return; vlan = VLAN_DEVAT(ifp, vid); VLAN_SETCOOKIE(vlan, ifp); } /* * Should match fw_caps_config_ enums in t4fw_interface.h */ static char *caps_decoder[] = { "\20\001IPMI\002NCSI", /* 0: NBM */ "\20\001PPP\002QFC\003DCBX", /* 1: link */ "\20\001INGRESS\002EGRESS", /* 2: switch */ "\20\001NIC\002VM\003IDS\004UM\005UM_ISGL" /* 3: NIC */ "\006HASHFILTER\007ETHOFLD", "\20\001TOE", /* 4: TOE */ "\20\001RDDP\002RDMAC", /* 5: RDMA */ "\20\001INITIATOR_PDU\002TARGET_PDU" /* 6: iSCSI */ "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD" "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD" "\007T10DIF" "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD", "\20\001LOOKASIDE\002TLSKEYS", /* 7: Crypto */ "\20\001INITIATOR\002TARGET\003CTRL_OFLD" /* 8: FCoE */ "\004PO_INITIATOR\005PO_TARGET", }; void t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *c0; static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"}; ctx = device_get_sysctl_ctx(sc->dev); /* * dev.t4nex.X. */ oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); sc->sc_do_rxcopy = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells", CTLTYPE_STRING | CTLFLAG_RD, doorbells, sc->doorbells, sysctl_bitfield, "A", "available doorbells"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL, sc->params.vpd.cclk, "core clock frequency (in KHz)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers", CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.timer_val, sizeof(sc->params.sge.timer_val), sysctl_int_array, "A", "interrupt holdoff timer values (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts", CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.counter_val, sizeof(sc->params.sge.counter_val), sysctl_int_array, "A", "interrupt holdoff packet counter values"); t4_sge_sysctls(sc, ctx, children); sc->lro_timeout = 100; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW, &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dflags", CTLFLAG_RW, &sc->debug_flags, 0, "flags to enable runtime debugging"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version", CTLFLAG_RD, sc->tp_version, 0, "TP microcode version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); if (sc->flags & IS_VF) return; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, NULL, chip_rev(sc), "chip hardware revision"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "sn", CTLFLAG_RD, sc->params.vpd.sn, 0, "serial number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pn", CTLFLAG_RD, sc->params.vpd.pn, 0, "part number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "ec", CTLFLAG_RD, sc->params.vpd.ec, 0, "engineering change"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "na", CTLFLAG_RD, sc->params.vpd.na, 0, "network address"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "er_version", CTLFLAG_RD, sc->er_version, 0, "expansion ROM version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bs_version", CTLFLAG_RD, sc->bs_version, 0, "bootstrap firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "scfg_version", CTLFLAG_RD, NULL, sc->params.scfg_vers, "serial config version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "vpd_version", CTLFLAG_RD, NULL, sc->params.vpd_vers, "VPD version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf", CTLFLAG_RD, sc->cfg_file, 0, "configuration file"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL, sc->cfcsum, "config file checksum"); #define SYSCTL_CAP(name, n, text) \ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \ CTLTYPE_STRING | CTLFLAG_RD, caps_decoder[n], sc->name, \ sysctl_bitfield, "A", "available " text " capabilities") SYSCTL_CAP(nbmcaps, 0, "NBM"); SYSCTL_CAP(linkcaps, 1, "link"); SYSCTL_CAP(switchcaps, 2, "switch"); SYSCTL_CAP(niccaps, 3, "NIC"); SYSCTL_CAP(toecaps, 4, "TCP offload"); SYSCTL_CAP(rdmacaps, 5, "RDMA"); SYSCTL_CAP(iscsicaps, 6, "iSCSI"); SYSCTL_CAP(cryptocaps, 7, "crypto"); SYSCTL_CAP(fcoecaps, 8, "FCoE"); #undef SYSCTL_CAP SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD, NULL, sc->tids.nftids, "number of filters"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0, sysctl_temperature, "I", "chip temperature (in Celsius)"); #ifdef SBUF_DRAIN /* * dev.t4nex.X.misc. Marked CTLFLAG_SKIP to avoid information overload. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc", CTLFLAG_RD | CTLFLAG_SKIP, NULL, "logs and miscellaneous information"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cctrl, "A", "congestion control"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp", CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0", CTLTYPE_STRING | CTLFLAG_RD, sc, 3, sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1", CTLTYPE_STRING | CTLFLAG_RD, sc, 4, sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5, sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, chip_id(sc) <= CHELSIO_T5 ? sysctl_cim_la : sysctl_cim_la_t6, "A", "CIM logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ma_la, "A", "CIM MA logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2", CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3", CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge", CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)"); if (chip_id(sc) > CHELSIO_T4) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_pif_la, "A", "CIM PIF logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_qcfg, "A", "CIM queue configuration"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cpl_stats, "A", "CPL statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ddp_stats, "A", "non-TCP DDP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_devlog, "A", "firmware's device log"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_fcoe_stats, "A", "FCoE statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_hw_sched, "A", "hardware scheduler "); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_l2t, "A", "hardware L2 table"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_lb_stats, "A", "loopback statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_meminfo, "A", "memory regions"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, chip_id(sc) <= CHELSIO_T5 ? sysctl_mps_tcam : sysctl_mps_tcam_t6, "A", "MPS TCAM entries"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_path_mtus, "A", "path MTUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_pm_stats, "A", "PM statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_rdma_stats, "A", "RDMA statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tcp_stats, "A", "TCP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tids, "A", "TID information"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_err_stats, "A", "TP error statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask", CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tp_la_mask, "I", "TP logic analyzer event capture mask"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_la, "A", "TP logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tx_rate, "A", "Tx rate"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ulprx_la, "A", "ULPRX logic analyzer"); if (chip_id(sc) >= CHELSIO_T5) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_wcwr_stats, "A", "write combined work requests"); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { /* * dev.t4nex.X.toe. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD, NULL, "TOE parameters"); children = SYSCTL_CHILDREN(oid); sc->tt.sndbuf = 256 * 1024; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW, &sc->tt.sndbuf, 0, "max hardware send buffer size"); sc->tt.ddp = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW, &sc->tt.ddp, 0, "DDP allowed"); sc->tt.rx_coalesce = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce", CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing"); sc->tt.tx_align = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align", CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload"); sc->tt.tx_zcopy = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy", CTLFLAG_RW, &sc->tt.tx_zcopy, 0, "Enable zero-copy aio_write(2)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A", "TP timer tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_tp_tick, "A", "TCP timestamp tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_tp_tick, "A", "DACK tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, sysctl_tp_dack_timer, "IU", "DACK timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MIN, sysctl_tp_timer, "LU", "Retransmit min (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MAX, sysctl_tp_timer, "LU", "Retransmit max (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MIN, sysctl_tp_timer, "LU", "Persist timer min (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MAX, sysctl_tp_timer, "LU", "Persist timer max (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_IDLE, sysctl_tp_timer, "LU", "Keepidle idle timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_intvl", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_INTVL, sysctl_tp_timer, "LU", "Keepidle interval (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_INIT_SRTT, sysctl_tp_timer, "LU", "Initial SRTT (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_FINWAIT2_TIMER, sysctl_tp_timer, "LU", "FINWAIT2 timer (us)"); } #endif } void vi_sysctls(struct vi_info *vi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(vi->dev); /* * dev.v?(cxgbe|cxl).X. */ oid = device_get_sysctl_tree(vi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL, vi->viid, "VI identifer"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD, &vi->nrxq, 0, "# of rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD, &vi->ntxq, 0, "# of tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD, &vi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &vi->first_txq, 0, "index of first tx queue"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL, vi->rss_size, "size of RSS indirection table"); if (IS_MAIN_VI(vi)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_noflowq, "IU", "Reserve queue 0 for non-flowid packets"); } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &vi->nofldrxq, 0, "# of rx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD, &vi->nofldtxq, 0, "# of tx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq", CTLFLAG_RD, &vi->first_ofld_rxq, 0, "index of first TOE rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq", CTLFLAG_RD, &vi->first_ofld_txq, 0, "index of first TOE tx queue"); } #endif #ifdef DEV_NETMAP if (vi->nnmrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, &vi->nnmrxq, 0, "# of netmap rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, &vi->nnmtxq, 0, "# of netmap tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", CTLFLAG_RD, &vi->first_nm_rxq, 0, "index of first netmap rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", CTLFLAG_RD, &vi->first_nm_txq, 0, "index of first netmap tx queue"); } #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_tmr_idx, "I", "holdoff timer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_rxq, "I", "rx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_txq, "I", "tx queue size"); } static void cxgbe_sysctls(struct port_info *pi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *children2; struct adapter *sc = pi->adapter; int i; char name[16]; ctx = device_get_sysctl_ctx(pi->dev); /* * dev.cxgbe.X. */ oid = device_get_sysctl_tree(pi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING | CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down"); if (pi->port_type == FW_PORT_TYPE_BT_XAUI) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I", "PHY temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version", CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I", "PHY firmware version"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings", CTLTYPE_STRING | CTLFLAG_RW, pi, PAUSE_TX, sysctl_pause_settings, "A", "PAUSE settings (bit 0 = rx_pause, bit 1 = tx_pause)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL, port_top_speed(pi), "max speed (in Gbps)"); if (sc->flags & IS_VF) return; /* * dev.(cxgbe|cxl).X.tc. */ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc", CTLFLAG_RD, NULL, "Tx scheduler traffic classes"); for (i = 0; i < sc->chip_params->nsched_cls; i++) { struct tx_sched_class *tc = &pi->tc[i]; snprintf(name, sizeof(name), "%d", i); children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL, "traffic class")); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "flags", CTLFLAG_RD, &tc->flags, 0, "flags"); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount", CTLFLAG_RD, &tc->refcount, 0, "references to this class"); #ifdef SBUF_DRAIN SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params", CTLTYPE_STRING | CTLFLAG_RD, sc, (pi->port_id << 16) | i, sysctl_tc_params, "A", "traffic class parameters"); #endif } /* * dev.cxgbe.X.stats. */ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD, NULL, "port statistics"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD, &pi->tx_parse_error, 0, "# of tx packets with invalid length or # of segments"); #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \ SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \ CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \ sysctl_handle_t4_reg64, "QU", desc) SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_64", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L)); SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L)); SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err", "# of frames received with bad FCS", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_len_err", "# of frames received with length error", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_64", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L)); #undef SYSCTL_ADD_T4_REG64 #define SYSCTL_ADD_T4_PORTSTAT(name, desc) \ SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \ &pi->stats.name, desc) /* We get these from port_stats and they may be stale by up to 1s */ SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0, "# drops due to buffer-group 0 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1, "# drops due to buffer-group 1 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2, "# drops due to buffer-group 2 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3, "# drops due to buffer-group 3 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc0, "# of buffer-group 0 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc1, "# of buffer-group 1 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc2, "# of buffer-group 2 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc3, "# of buffer-group 3 truncated packets"); #undef SYSCTL_ADD_T4_PORTSTAT } static int sysctl_int_array(SYSCTL_HANDLER_ARGS) { int rc, *i, space = 0; struct sbuf sb; sbuf_new_for_sysctl(&sb, NULL, 64, req); for (i = arg1; arg2; arg2 -= sizeof(int), i++) { if (space) sbuf_printf(&sb, " "); sbuf_printf(&sb, "%d", *i); space = 1; } rc = sbuf_finish(&sb); sbuf_delete(&sb); return (rc); } static int sysctl_bitfield(SYSCTL_HANDLER_ARGS) { int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", (int)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_btphy(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int op = arg2; struct adapter *sc = pi->adapter; u_int v; int rc; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt"); if (rc) return (rc); /* XXX: magic numbers */ rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820, &v); end_synchronized_op(sc, 0); if (rc) return (rc); if (op == 0) v /= 256; rc = sysctl_handle_int(oidp, &v, 0, req); return (rc); } static int sysctl_noflowq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; int rc, val; val = vi->rsrv_noflowq; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if ((val >= 1) && (vi->ntxq > 1)) vi->rsrv_noflowq = 1; else vi->rsrv_noflowq = 0; return (rc); } static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc, i; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif uint8_t v; idx = vi->tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1); for_each_rxq(vi, i, rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&rxq->iq.intr_params, v); #else rxq->iq.intr_params = v; #endif } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&ofld_rxq->iq.intr_params, v); #else ofld_rxq->iq.intr_params = v; #endif } #endif vi->tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc; idx = vi->pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4pktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int qsize, rc; qsize = vi->qsize_rxq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || (qsize & 7)) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4rxqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_rxq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int qsize, rc; qsize = vi->qsize_txq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || qsize > 65536) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_txq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1PAUSE_RX\2PAUSE_TX"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", lc->fc & (PAUSE_TX | PAUSE_RX), bits); rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); if (s[1] != 0) return (EINVAL); if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; if (n & ~(PAUSE_TX | PAUSE_RX)) return (EINVAL); /* some other bit is set too */ rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4PAUSE"); if (rc) return (rc); if ((lc->requested_fc & (PAUSE_TX | PAUSE_RX)) != n) { int link_ok = lc->link_ok; lc->requested_fc &= ~(PAUSE_TX | PAUSE_RX); lc->requested_fc |= n; rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); lc->link_ok = link_ok; /* restore */ } end_synchronized_op(sc, 0); } return (rc); } static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; uint64_t val; val = t4_read_reg64(sc, reg); return (sysctl_handle_64(oidp, &val, 0, req)); } static int sysctl_temperature(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, t; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp"); if (rc) return (rc); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); end_synchronized_op(sc, 0); if (rc) return (rc); /* unknown is returned as 0 but we display -1 in that case */ t = val == 0 ? -1 : val; rc = sysctl_handle_int(oidp, &t, 0, req); return (rc); } #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t incr[NMTUS][NCCTRL_WIN]; static const char *dec_fac[] = { "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", "0.9375" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); t4_read_cong_tbl(sc, incr); for (i = 0; i < NCCTRL_WIN; ++i) { sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i], incr[5][i], incr[6][i], incr[7][i]); sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", incr[8][i], incr[9][i], incr[10][i], incr[11][i], incr[12][i], incr[13][i], incr[14][i], incr[15][i], sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = { "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", /* ibq's */ "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", /* obq's */ "SGE0-RX", "SGE1-RX" /* additional obq's (T5 onwards) */ }; static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n, qid = arg2; uint32_t *buf, *p; char *qtype; u_int cim_num_obq = sc->chip_params->cim_num_obq; KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq, ("%s: bad qid %d\n", __func__, qid)); if (qid < CIM_NUM_IBQ) { /* inbound queue */ qtype = "IBQ"; n = 4 * CIM_IBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_ibq(sc, qid, buf, n); } else { /* outbound queue */ qtype = "OBQ"; qid -= CIM_NUM_IBQ; n = 4 * cim_num_obq * CIM_OBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_obq(sc, qid, buf, n); } if (rc < 0) { rc = -rc; goto done; } n = rc * sizeof(uint32_t); /* rc has # of words actually read */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]); for (i = 0, p = buf; i < n; i += 16, p += 4) sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1], p[2], p[3]); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int cfg; struct sbuf *sb; uint32_t *buf, *p; int rc; MPASS(chip_id(sc) <= CHELSIO_T5); rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_cim_read_la(sc, buf, NULL); if (rc != 0) goto done; sbuf_printf(sb, "Status Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x", p[5] & 0xff, p[6], p[7]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x", (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, p[4] & 0xff, p[5] >> 8); sbuf_printf(sb, "\n %02x %x%07x %x%07x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); } else { sbuf_printf(sb, "\n %02x %x%07x %x%07x %08x %08x " "%08x%08x%08x%08x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], p[6], p[7]); } } rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la_t6(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int cfg; struct sbuf *sb; uint32_t *buf, *p; int rc; MPASS(chip_id(sc) > CHELSIO_T5); rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_cim_read_la(sc, buf, NULL); if (rc != 0) goto done; sbuf_printf(sb, "Status Inst Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data LS1Stat LS1Addr LS1Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x %08x", p[3] & 0xff, p[2], p[1], p[0]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x %02x%06x", (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8, p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8); sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16); } else { sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x " "%08x %08x %08x %08x %08x %08x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16, p[2], p[1], p[0], p[5], p[4], p[3]); } } rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE); p = buf; for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCnt ID Tag UE Data RDY VLD"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%3u %2u %x %u %08x%08x %u %u", (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, (p[1] >> 2) | ((p[2] & 3) << 30), (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, p[0] & 1); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL); p = buf; sbuf_printf(sb, "Cntl ID DataBE Addr Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %04x %08x %08x%08x%08x%08x", (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCntl ID Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %08x%08x%08x%08x", (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t thres[CIM_NUM_IBQ]; uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr; uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat; u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq; cim_num_obq = sc->chip_params->cim_num_obq; if (is_t4(sc)) { ibq_rdaddr = A_UP_IBQ_0_RDADDR; obq_rdaddr = A_UP_OBQ_0_REALADDR; } else { ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR; obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR; } nq = CIM_NUM_IBQ + cim_num_obq; rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat); if (rc == 0) rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr); if (rc != 0) return (rc); t4_read_cimq_cfg(sc, base, size, thres); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Queue Base Size Thres RdPtr WrPtr SOP EOP Avail"); for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) sbuf_printf(sb, "\n%7s %5x %5u %5u %6x %4x %4u %4u %5u", qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]), G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); for ( ; i < nq; i++, p += 4, wr += 2) sbuf_printf(sb, "\n%7s %5x %5u %12x %4x %4u %4u %5u", qname[i], base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff, wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_cpl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_cpl_stats(sc, &stats); mtx_unlock(&sc->reg_lock); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\nCPL requests: %10u %10u %10u %10u", stats.req[0], stats.req[1], stats.req[2], stats.req[3]); sbuf_printf(sb, "\nCPL responses: %10u %10u %10u %10u", stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\nCPL requests: %10u %10u", stats.req[0], stats.req[1]); sbuf_printf(sb, "\nCPL responses: %10u %10u", stats.rsp[0], stats.rsp[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_usm_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_usm_stats(sc, &stats); sbuf_printf(sb, "Frames: %u\n", stats.frames); sbuf_printf(sb, "Octets: %ju\n", stats.octets); sbuf_printf(sb, "Drops: %u", stats.drops); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char * const devlog_level_strings[] = { [FW_DEVLOG_LEVEL_EMERG] = "EMERG", [FW_DEVLOG_LEVEL_CRIT] = "CRIT", [FW_DEVLOG_LEVEL_ERR] = "ERR", [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", [FW_DEVLOG_LEVEL_INFO] = "INFO", [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" }; static const char * const devlog_facility_strings[] = { [FW_DEVLOG_FACILITY_CORE] = "CORE", [FW_DEVLOG_FACILITY_CF] = "CF", [FW_DEVLOG_FACILITY_SCHED] = "SCHED", [FW_DEVLOG_FACILITY_TIMER] = "TIMER", [FW_DEVLOG_FACILITY_RES] = "RES", [FW_DEVLOG_FACILITY_HW] = "HW", [FW_DEVLOG_FACILITY_FLR] = "FLR", [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", [FW_DEVLOG_FACILITY_PHY] = "PHY", [FW_DEVLOG_FACILITY_MAC] = "MAC", [FW_DEVLOG_FACILITY_PORT] = "PORT", [FW_DEVLOG_FACILITY_VI] = "VI", [FW_DEVLOG_FACILITY_FILTER] = "FILTER", [FW_DEVLOG_FACILITY_ACL] = "ACL", [FW_DEVLOG_FACILITY_TM] = "TM", [FW_DEVLOG_FACILITY_QFC] = "QFC", [FW_DEVLOG_FACILITY_DCB] = "DCB", [FW_DEVLOG_FACILITY_ETH] = "ETH", [FW_DEVLOG_FACILITY_OFLD] = "OFLD", [FW_DEVLOG_FACILITY_RI] = "RI", [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", [FW_DEVLOG_FACILITY_FCOE] = "FCOE", [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE", [FW_DEVLOG_FACILITY_CHNET] = "CHNET", }; static int sysctl_devlog(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e *buf, *e; int i, j, rc, nentries, first = 0; struct sbuf *sb; uint64_t ftstamp = UINT64_MAX; if (dparams->addr == 0) return (ENXIO); buf = malloc(dparams->size, M_CXGBE, M_NOWAIT); if (buf == NULL) return (ENOMEM); rc = read_via_memwin(sc, 1, dparams->addr, (void *)buf, dparams->size); if (rc != 0) goto done; nentries = dparams->size / sizeof(struct fw_devlog_e); for (i = 0; i < nentries; i++) { e = &buf[i]; if (e->timestamp == 0) break; /* end */ e->timestamp = be64toh(e->timestamp); e->seqno = be32toh(e->seqno); for (j = 0; j < 8; j++) e->params[j] = be32toh(e->params[j]); if (e->timestamp < ftstamp) { ftstamp = e->timestamp; first = i; } } if (buf[first].timestamp == 0) goto done; /* nothing in the log */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%10s %15s %8s %8s %s\n", "Seq#", "Tstamp", "Level", "Facility", "Message"); i = first; do { e = &buf[i]; if (e->timestamp == 0) break; /* end */ sbuf_printf(sb, "%10d %15ju %8s %8s ", e->seqno, e->timestamp, (e->level < nitems(devlog_level_strings) ? devlog_level_strings[e->level] : "UNKNOWN"), (e->facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e->facility] : "UNKNOWN")); sbuf_printf(sb, e->fmt, e->params[0], e->params[1], e->params[2], e->params[3], e->params[4], e->params[5], e->params[6], e->params[7]); if (++i == nentries) i = 0; } while (i != first); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_fcoe_stats stats[MAX_NCHAN]; int i, nchan = sc->chip_params->nchan; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nchan; i++) t4_get_fcoe_stats(sc, i, &stats[i]); if (nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp, stats[2].octets_ddp, stats[3].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp, stats[2].frames_ddp, stats[3].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u", stats[0].frames_drop, stats[1].frames_drop, stats[2].frames_drop, stats[3].frames_drop); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u", stats[0].frames_drop, stats[1].frames_drop); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; unsigned int map, kbps, ipg, mode; unsigned int pace_tab[NTX_SCHED]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP); mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG)); t4_read_pace_tbl(sc, pace_tab); sbuf_printf(sb, "Scheduler Mode Channel Rate (Kbps) " "Class IPG (0.1 ns) Flow IPG (us)"); for (i = 0; i < NTX_SCHED; ++i, map >>= 2) { t4_get_tx_sched(sc, i, &kbps, &ipg); sbuf_printf(sb, "\n %u %-5s %u ", i, (mode & (1 << i)) ? "flow" : "class", map & 3); if (kbps) sbuf_printf(sb, "%9u ", kbps); else sbuf_printf(sb, " disabled "); if (ipg) sbuf_printf(sb, "%13u ", ipg); else sbuf_printf(sb, " disabled "); if (pace_tab[i]) sbuf_printf(sb, "%10u", pace_tab[i]); else sbuf_printf(sb, " disabled"); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, j; uint64_t *p0, *p1; struct lb_port_stats s[2]; static const char *stat_name[] = { "OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:", "UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:", "Frames128To255:", "Frames256To511:", "Frames512To1023:", "Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:", "BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:", "BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:", "BG2FramesTrunc:", "BG3FramesTrunc:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); memset(s, 0, sizeof(s)); for (i = 0; i < sc->chip_params->nchan; i += 2) { t4_get_lb_stats(sc, i, &s[0]); t4_get_lb_stats(sc, i + 1, &s[1]); p0 = &s[0].octets; p1 = &s[1].octets; sbuf_printf(sb, "%s Loopback %u" " Loopback %u", i == 0 ? "" : "\n", i, i + 1); for (j = 0; j < nitems(stat_name); j++) sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j], *p0++, *p1++); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS) { int rc = 0; struct port_info *pi = arg1; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 64, req); if (sb == NULL) return (ENOMEM); if (pi->linkdnrc < 0) sbuf_printf(sb, "n/a"); else sbuf_printf(sb, "%s", t4_link_down_rc_str(pi->linkdnrc)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct mem_desc { unsigned int base; unsigned int limit; unsigned int idx; }; static int mem_desc_cmp(const void *a, const void *b) { return ((const struct mem_desc *)a)->base - ((const struct mem_desc *)b)->base; } static void mem_region_show(struct sbuf *sb, const char *name, unsigned int from, unsigned int to) { unsigned int size; if (from == to) return; size = to - from + 1; if (size == 0) return; /* XXX: need humanize_number(3) in libkern for a more readable 'size' */ sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size); } static int sysctl_meminfo(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; uint32_t lo, hi, used, alloc; static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"}; static const char *region[] = { "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", "TDDP region:", "TPT region:", "STAG region:", "RQ region:", "RQUDP region:", "PBL region:", "TXPBL region:", "DBVFIFO region:", "ULPRX state:", "ULPTX state:", "On-chip queues:" }; struct mem_desc avail[4]; struct mem_desc mem[nitems(region) + 3]; /* up to 3 holes */ struct mem_desc *md = mem; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nitems(mem); i++) { mem[i].limit = 0; mem[i].idx = i; } /* Find and sort the populated memory ranges */ i = 0; lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (lo & F_EDRAM0_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM0_BAR); avail[i].base = G_EDRAM0_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20); avail[i].idx = 0; i++; } if (lo & F_EDRAM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM1_BAR); avail[i].base = G_EDRAM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20); avail[i].idx = 1; i++; } if (lo & F_EXT_MEM_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); avail[i].base = G_EXT_MEM_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20); avail[i].idx = is_t5(sc) ? 3 : 2; /* Call it MC0 for T5 */ i++; } if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 4; i++; } if (!i) /* no memory available */ return 0; qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp); (md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR); (md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE); /* the next few have explicit upper bounds */ md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) * G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE)); md++; md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) * G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE)); md++; if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { if (chip_id(sc) <= CHELSIO_T5) md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE); else md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR); md->limit = 0; } else { md->base = 0; md->idx = nitems(region); /* hide it */ } md++; #define ulp_region(reg) \ md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\ (md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT) ulp_region(RX_ISCSI); ulp_region(RX_TDDP); ulp_region(TX_TPT); ulp_region(RX_STAG); ulp_region(RX_RQ); ulp_region(RX_RQUDP); ulp_region(RX_PBL); ulp_region(TX_PBL); #undef ulp_region md->base = 0; md->idx = nitems(region); if (!is_t4(sc)) { uint32_t size = 0; uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2); uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE); if (is_t5(sc)) { if (sge_ctrl & F_VFIFO_ENABLE) size = G_DBVFIFO_SIZE(fifo_size); } else size = G_T6_DBVFIFO_SIZE(fifo_size); if (size) { md->base = G_BASEADDR(t4_read_reg(sc, A_SGE_DBVFIFO_BADDR)); md->limit = md->base + (size << 2) - 1; } } md++; md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE); md->limit = 0; md++; md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE); md->limit = 0; md++; md->base = sc->vres.ocq.start; if (sc->vres.ocq.size) md->limit = md->base + sc->vres.ocq.size - 1; else md->idx = nitems(region); /* hide it */ md++; /* add any address-space holes, there can be up to 3 */ for (n = 0; n < i - 1; n++) if (avail[n].limit < avail[n + 1].base) (md++)->base = avail[n].limit; if (avail[n].limit) (md++)->base = avail[n].limit; n = md - mem; qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp); for (lo = 0; lo < i; lo++) mem_region_show(sb, memory[avail[lo].idx], avail[lo].base, avail[lo].limit - 1); sbuf_printf(sb, "\n"); for (i = 0; i < n; i++) { if (mem[i].idx >= nitems(region)) continue; /* skip holes */ if (!mem[i].limit) mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0; mem_region_show(sb, region[mem[i].idx], mem[i].base, mem[i].limit); } sbuf_printf(sb, "\n"); lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP RAM:", lo, hi); lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP Extmem2:", lo, hi); lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE); sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n", G_PMRXMAXPAGE(lo), t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10, (lo & F_PMRXNUMCHN) ? 2 : 1); lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE); hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n", G_PMTXMAXPAGE(lo), hi >= (1 << 20) ? (hi >> 20) : (hi >> 10), hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo)); sbuf_printf(sb, "%u p-structs\n", t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT)); for (i = 0; i < 4; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated", i, used, alloc); } for (i = 0; i < sc->chip_params->nchan; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nLoopback %d using %u pages out of %u allocated", i, used, alloc); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static inline void tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask) { *mask = x | y; y = htobe64(y); memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN); } static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) <= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask Vld Ports PF" " VF Replication P0 P1 P2 P3 ML"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint64_t tcamx, tcamy, mask; uint32_t cls_lo, cls_hi; uint8_t addr[ETHER_ADDR_LEN]; tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i)); tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i)); if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx" " %c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, (cls_lo & F_SRAM_VLD) ? 'Y' : 'N', G_PORTMAP(cls_hi), G_PF(cls_lo), (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1); if (cls_lo & F_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, "%36d", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%36s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo), G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo), G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) > CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask VNI Mask" " IVLAN Vld DIP_Hit Lookup Port Vld Ports PF VF" " Replication" " P0 P1 P2 P3 ML\n"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint8_t dip_hit, vlan_vld, lookup_type, port_num; uint16_t ivlan; uint64_t tcamx, tcamy, val, mask; uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy; uint8_t addr[ETHER_ADDR_LEN]; ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0); if (i < 256) ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0); else ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1); t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamy = G_DMACH(val) << 32; tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); lookup_type = G_DATALKPTYPE(data2); port_num = G_DATAPORTNUM(data2); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI */ vniy = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); dip_hit = data2 & F_DATADIPHIT; vlan_vld = 0; } else { vniy = 0; dip_hit = 0; vlan_vld = data2 & F_DATAVIDH2; ivlan = G_VIDL(val); } ctl |= V_CTLXYBITSEL(1); t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamx = G_DMACH(val) << 32; tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI mask */ vnix = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); } else vnix = 0; if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); if (lookup_type && lookup_type != M_DATALKPTYPE) { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx %06x %06x - - %3c" " 'I' %4x %3c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } else { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx - - ", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask); if (vlan_vld) sbuf_printf(sb, "%4u Y ", ivlan); else sbuf_printf(sb, " - N "); sbuf_printf(sb, "- %3c %4x %3c %#x%4u%4d", lookup_type ? 'I' : 'O', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } if (cls_lo & F_T6_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t6mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, "%72d", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x" " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc255_224), be32toh(ldst_cmd.u.mps.rplc.rplc223_192), be32toh(ldst_cmd.u.mps.rplc.rplc191_160), be32toh(ldst_cmd.u.mps.rplc.rplc159_128), be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%72s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#x", G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo), G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo), (cls_lo >> S_T6_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint16_t mtus[NMTUS]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_read_mtu_tbl(sc, mtus, NULL); sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u", mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6], mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13], mtus[14], mtus[15]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS]; uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS]; static const char *tx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Bypass + mem:", "Tx FIFO wait", NULL, "Tx latency" }; static const char *rx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Flush:", - " Rx FIFO wait", NULL, "Rx latency" + "Rx FIFO wait", NULL, "Rx latency" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_pmtx_get_stats(sc, tx_cnt, tx_cyc); t4_pmrx_get_stats(sc, rx_cnt, rx_cyc); sbuf_printf(sb, " Tx pcmds Tx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); } sbuf_printf(sb, "\n Rx pcmds Rx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } if (chip_id(sc) > CHELSIO_T5) { sbuf_printf(sb, "\n Total wait Total occupancy"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); i += 2; MPASS(i < nitems(tx_stats)); sbuf_printf(sb, "\n Reads Total wait"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_rdma_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_rdma_stats(sc, &stats); mtx_unlock(&sc->reg_lock); sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod); sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tcp_stats v4, v6; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_tcp_stats(sc, &v4, &v6); mtx_unlock(&sc->reg_lock); sbuf_printf(sb, " IP IPv6\n"); sbuf_printf(sb, "OutRsts: %20u %20u\n", v4.tcp_out_rsts, v6.tcp_out_rsts); sbuf_printf(sb, "InSegs: %20ju %20ju\n", v4.tcp_in_segs, v6.tcp_in_segs); sbuf_printf(sb, "OutSegs: %20ju %20ju\n", v4.tcp_out_segs, v6.tcp_out_segs); sbuf_printf(sb, "RetransSegs: %20ju %20ju", v4.tcp_retrans_segs, v6.tcp_retrans_segs); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tids(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tid_info *t = &sc->tids; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (t->natids) { sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1, t->atids_in_use); } if (t->ntids) { if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { uint32_t b; if (chip_id(sc) <= CHELSIO_T5) b = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4; else b = t4_read_reg(sc, A_LE_DB_SRVR_START_INDEX); if (b) { sbuf_printf(sb, "TID range: 0-%u, %u-%u", b - 1, t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } else { sbuf_printf(sb, "TID range: %u-%u", t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } } else sbuf_printf(sb, "TID range: 0-%u", t->ntids - 1); sbuf_printf(sb, ", in use: %u\n", atomic_load_acq_int(&t->tids_in_use)); } if (t->nstids) { sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base, t->stid_base + t->nstids - 1, t->stids_in_use); } if (t->nftids) { sbuf_printf(sb, "FTID range: %u-%u\n", t->ftid_base, t->ftid_base + t->nftids - 1); } if (t->netids) { sbuf_printf(sb, "ETID range: %u-%u\n", t->etid_base, t->etid_base + t->netids - 1); } sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users", t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4), t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_err_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_err_stats(sc, &stats); mtx_unlock(&sc->reg_lock); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "macInErrs: %10u %10u %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1], stats.mac_in_errs[2], stats.mac_in_errs[3]); sbuf_printf(sb, "hdrInErrs: %10u %10u %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1], stats.hdr_in_errs[2], stats.hdr_in_errs[3]); sbuf_printf(sb, "tcpInErrs: %10u %10u %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1], stats.tcp_in_errs[2], stats.tcp_in_errs[3]); sbuf_printf(sb, "tcp6InErrs: %10u %10u %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1], stats.tcp6_in_errs[2], stats.tcp6_in_errs[3]); sbuf_printf(sb, "tnlCongDrops: %10u %10u %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1], stats.tnl_cong_drops[2], stats.tnl_cong_drops[3]); sbuf_printf(sb, "tnlTxDrops: %10u %10u %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1], stats.tnl_tx_drops[2], stats.tnl_tx_drops[3]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1], stats.ofld_vlan_drops[2], stats.ofld_vlan_drops[3]); sbuf_printf(sb, "ofldChanDrops: %10u %10u %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1], stats.ofld_chan_drops[2], stats.ofld_chan_drops[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "macInErrs: %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1]); sbuf_printf(sb, "hdrInErrs: %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1]); sbuf_printf(sb, "tcpInErrs: %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1]); sbuf_printf(sb, "tcp6InErrs: %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1]); sbuf_printf(sb, "tnlCongDrops: %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1]); sbuf_printf(sb, "tnlTxDrops: %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1]); sbuf_printf(sb, "ofldChanDrops: %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1]); } sbuf_printf(sb, "ofldNoNeigh: %u\nofldCongDefer: %u", stats.ofld_no_neigh, stats.ofld_cong_defer); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tp_params *tpp = &sc->params.tp; u_int mask; int rc; mask = tpp->la_mask >> 16; rc = sysctl_handle_int(oidp, &mask, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (mask > 0xffff) return (EINVAL); tpp->la_mask = mask << 16; t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U, tpp->la_mask); return (0); } struct field_desc { const char *name; u_int start; u_int width; }; static void field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f) { char buf[32]; int line_size = 0; while (f->name) { uint64_t mask = (1ULL << f->width) - 1; int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name, ((uintmax_t)v >> f->start) & mask); if (line_size + len >= 79) { line_size = 8; sbuf_printf(sb, "\n "); } sbuf_printf(sb, "%s ", buf); line_size += len + 1; f++; } sbuf_printf(sb, "\n"); } static const struct field_desc tp_la0[] = { { "RcfOpCodeOut", 60, 4 }, { "State", 56, 4 }, { "WcfState", 52, 4 }, { "RcfOpcSrcOut", 50, 2 }, { "CRxError", 49, 1 }, { "ERxError", 48, 1 }, { "SanityFailed", 47, 1 }, { "SpuriousMsg", 46, 1 }, { "FlushInputMsg", 45, 1 }, { "FlushInputCpl", 44, 1 }, { "RssUpBit", 43, 1 }, { "RssFilterHit", 42, 1 }, { "Tid", 32, 10 }, { "InitTcb", 31, 1 }, { "LineNumber", 24, 7 }, { "Emsg", 23, 1 }, { "EdataOut", 22, 1 }, { "Cmsg", 21, 1 }, { "CdataOut", 20, 1 }, { "EreadPdu", 19, 1 }, { "CreadPdu", 18, 1 }, { "TunnelPkt", 17, 1 }, { "RcfPeerFin", 16, 1 }, { "RcfReasonOut", 12, 4 }, { "TxCchannel", 10, 2 }, { "RcfTxChannel", 8, 2 }, { "RxEchannel", 6, 2 }, { "RcfRxChannel", 5, 1 }, { "RcfDataOutSrdy", 4, 1 }, { "RxDvld", 3, 1 }, { "RxOoDvld", 2, 1 }, { "RxCongestion", 1, 1 }, { "TxCongestion", 0, 1 }, { NULL } }; static const struct field_desc tp_la1[] = { { "CplCmdIn", 56, 8 }, { "CplCmdOut", 48, 8 }, { "ESynOut", 47, 1 }, { "EAckOut", 46, 1 }, { "EFinOut", 45, 1 }, { "ERstOut", 44, 1 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static const struct field_desc tp_la2[] = { { "CplCmdIn", 56, 8 }, { "MpsVfVld", 55, 1 }, { "MpsPf", 52, 3 }, { "MpsVf", 44, 8 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static void tp_la_show(struct sbuf *sb, uint64_t *p, int idx) { field_desc_show(sb, *p, tp_la0); } static void tp_la_show2(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], tp_la0); } static void tp_la_show3(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? tp_la2 : tp_la1); } static int sysctl_tp_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint64_t *buf, *p; int rc; u_int i, inc; void (*show_func)(struct sbuf *, uint64_t *, int); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK); t4_tp_read_la(sc, buf, NULL); p = buf; switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) { case 2: inc = 2; show_func = tp_la_show2; break; case 3: inc = 2; show_func = tp_la_show3; break; default: inc = 1; show_func = tp_la_show; } for (i = 0; i < TPLA_SIZE / inc; i++, p += inc) (*show_func)(sb, p, i); rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; u64 nrate[MAX_NCHAN], orate[MAX_NCHAN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_chan_txrate(sc, nrate, orate); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju %10ju %10ju\n", nrate[0], nrate[1], nrate[2], nrate[3]); sbuf_printf(sb, "Offload B/s: %10ju %10ju %10ju %10ju", orate[0], orate[1], orate[2], orate[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju\n", nrate[0], nrate[1]); sbuf_printf(sb, "Offload B/s: %10ju %10ju", orate[0], orate[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint32_t *buf, *p; int rc, i; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_ulprx_read_la(sc, buf); p = buf; sbuf_printf(sb, " Pcmd Type Message" " Data"); for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) { sbuf_printf(sb, "\n%08x%08x %4x %08x %08x%08x%08x%08x", p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, v; MPASS(chip_id(sc) >= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); v = t4_read_reg(sc, A_SGE_STAT_CFG); if (G_STATSOURCE_T5(v) == 7) { int mode; mode = is_t5(sc) ? G_STATMODE(v) : G_T6_STATMODE(v); if (mode == 0) { sbuf_printf(sb, "total %d, incomplete %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } else if (mode == 1) { sbuf_printf(sb, "total %d, data overflow %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } else { sbuf_printf(sb, "unknown mode %d", mode); } } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tc_params(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tx_sched_class *tc; struct t4_sched_class_params p; struct sbuf *sb; int i, rc, port_id, flags, mbps, gbps; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); port_id = arg2 >> 16; MPASS(port_id < sc->params.nports); MPASS(sc->port[port_id] != NULL); i = arg2 & 0xffff; MPASS(i < sc->chip_params->nsched_cls); tc = &sc->port[port_id]->tc[i]; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tc_p"); if (rc) goto done; flags = tc->flags; p = tc->params; end_synchronized_op(sc, LOCK_HELD); if ((flags & TX_SC_OK) == 0) { sbuf_printf(sb, "none"); goto done; } if (p.level == SCHED_CLASS_LEVEL_CL_WRR) { sbuf_printf(sb, "cl-wrr weight %u", p.weight); goto done; } else if (p.level == SCHED_CLASS_LEVEL_CL_RL) sbuf_printf(sb, "cl-rl"); else if (p.level == SCHED_CLASS_LEVEL_CH_RL) sbuf_printf(sb, "ch-rl"); else { rc = ENXIO; goto done; } if (p.ratemode == SCHED_CLASS_RATEMODE_REL) { /* XXX: top speed or actual link speed? */ gbps = port_top_speed(sc->port[port_id]); sbuf_printf(sb, " %u%% of %uGbps", p.maxrate, gbps); } else if (p.ratemode == SCHED_CLASS_RATEMODE_ABS) { switch (p.rateunit) { case SCHED_CLASS_RATEUNIT_BITS: mbps = p.maxrate / 1000; gbps = p.maxrate / 1000000; if (p.maxrate == gbps * 1000000) sbuf_printf(sb, " %uGbps", gbps); else if (p.maxrate == mbps * 1000) sbuf_printf(sb, " %uMbps", mbps); else sbuf_printf(sb, " %uKbps", p.maxrate); break; case SCHED_CLASS_RATEUNIT_PKTS: sbuf_printf(sb, " %upps", p.maxrate); break; default: rc = ENXIO; goto done; } } switch (p.mode) { case SCHED_CLASS_MODE_CLASS: sbuf_printf(sb, " aggregate"); break; case SCHED_CLASS_MODE_FLOW: sbuf_printf(sb, " per-flow"); break; default: rc = ENXIO; goto done; } done: if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } #endif #ifdef TCP_OFFLOAD static void unit_conv(char *buf, size_t len, u_int val, u_int factor) { u_int rem = val % factor; if (rem == 0) snprintf(buf, len, "%u", val / factor); else { while (rem % 10 == 0) rem /= 10; snprintf(buf, len, "%u.%u", val / factor, rem); } } static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; char buf[16]; u_int res, re; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); switch (arg2) { case 0: /* timer_tick */ re = G_TIMERRESOLUTION(res); break; case 1: /* TCP timestamp tick */ re = G_TIMESTAMPRESOLUTION(res); break; case 2: /* DACK tick */ re = G_DELAYEDACKRESOLUTION(res); break; default: return (EDOOFUS); } unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000); return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int res, dack_re, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); dack_re = G_DELAYEDACKRESOLUTION(res); v = ((cclk_ps << dack_re) / 1000000) * t4_read_reg(sc, A_TP_DACK_TIMER); return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; u_int tre; u_long tp_tick_us, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX || reg == A_TP_PERS_MIN || reg == A_TP_PERS_MAX || reg == A_TP_KEEP_IDLE || A_TP_KEEP_INTVL || reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER); tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION)); tp_tick_us = (cclk_ps << tre) / 1000000; if (reg == A_TP_INIT_SRTT) v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg)); else v = tp_tick_us * t4_read_reg(sc, reg); return (sysctl_handle_long(oidp, &v, 0, req)); } #endif static uint32_t fconf_iconf_to_mode(uint32_t fconf, uint32_t iconf) { uint32_t mode; mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT; if (fconf & F_FRAGMENTATION) mode |= T4_FILTER_IP_FRAGMENT; if (fconf & F_MPSHITTYPE) mode |= T4_FILTER_MPS_HIT_TYPE; if (fconf & F_MACMATCH) mode |= T4_FILTER_MAC_IDX; if (fconf & F_ETHERTYPE) mode |= T4_FILTER_ETH_TYPE; if (fconf & F_PROTOCOL) mode |= T4_FILTER_IP_PROTO; if (fconf & F_TOS) mode |= T4_FILTER_IP_TOS; if (fconf & F_VLAN) mode |= T4_FILTER_VLAN; if (fconf & F_VNIC_ID) { mode |= T4_FILTER_VNIC; if (iconf & F_VNIC) mode |= T4_FILTER_IC_VNIC; } if (fconf & F_PORT) mode |= T4_FILTER_PORT; if (fconf & F_FCOE) mode |= T4_FILTER_FCoE; return (mode); } static uint32_t mode_to_fconf(uint32_t mode) { uint32_t fconf = 0; if (mode & T4_FILTER_IP_FRAGMENT) fconf |= F_FRAGMENTATION; if (mode & T4_FILTER_MPS_HIT_TYPE) fconf |= F_MPSHITTYPE; if (mode & T4_FILTER_MAC_IDX) fconf |= F_MACMATCH; if (mode & T4_FILTER_ETH_TYPE) fconf |= F_ETHERTYPE; if (mode & T4_FILTER_IP_PROTO) fconf |= F_PROTOCOL; if (mode & T4_FILTER_IP_TOS) fconf |= F_TOS; if (mode & T4_FILTER_VLAN) fconf |= F_VLAN; if (mode & T4_FILTER_VNIC) fconf |= F_VNIC_ID; if (mode & T4_FILTER_PORT) fconf |= F_PORT; if (mode & T4_FILTER_FCoE) fconf |= F_FCOE; return (fconf); } static uint32_t mode_to_iconf(uint32_t mode) { if (mode & T4_FILTER_IC_VNIC) return (F_VNIC); return (0); } static int check_fspec_against_fconf_iconf(struct adapter *sc, struct t4_filter_specification *fs) { struct tp_params *tpp = &sc->params.tp; uint32_t fconf = 0; if (fs->val.frag || fs->mask.frag) fconf |= F_FRAGMENTATION; if (fs->val.matchtype || fs->mask.matchtype) fconf |= F_MPSHITTYPE; if (fs->val.macidx || fs->mask.macidx) fconf |= F_MACMATCH; if (fs->val.ethtype || fs->mask.ethtype) fconf |= F_ETHERTYPE; if (fs->val.proto || fs->mask.proto) fconf |= F_PROTOCOL; if (fs->val.tos || fs->mask.tos) fconf |= F_TOS; if (fs->val.vlan_vld || fs->mask.vlan_vld) fconf |= F_VLAN; if (fs->val.ovlan_vld || fs->mask.ovlan_vld) { fconf |= F_VNIC_ID; if (tpp->ingress_config & F_VNIC) return (EINVAL); } if (fs->val.pfvf_vld || fs->mask.pfvf_vld) { fconf |= F_VNIC_ID; if ((tpp->ingress_config & F_VNIC) == 0) return (EINVAL); } if (fs->val.iport || fs->mask.iport) fconf |= F_PORT; if (fs->val.fcoe || fs->mask.fcoe) fconf |= F_FCOE; if ((tpp->vlan_pri_map | fconf) != tpp->vlan_pri_map) return (E2BIG); return (0); } static int get_filter_mode(struct adapter *sc, uint32_t *mode) { struct tp_params *tpp = &sc->params.tp; /* * We trust the cached values of the relevant TP registers. This means * things work reliably only if writes to those registers are always via * t4_set_filter_mode. */ *mode = fconf_iconf_to_mode(tpp->vlan_pri_map, tpp->ingress_config); return (0); } static int set_filter_mode(struct adapter *sc, uint32_t mode) { struct tp_params *tpp = &sc->params.tp; uint32_t fconf, iconf; int rc; iconf = mode_to_iconf(mode); if ((iconf ^ tpp->ingress_config) & F_VNIC) { /* * For now we just complain if A_TP_INGRESS_CONFIG is not * already set to the correct value for the requested filter * mode. It's not clear if it's safe to write to this register * on the fly. (And we trust the cached value of the register). */ return (EBUSY); } fconf = mode_to_fconf(mode); rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4setfm"); if (rc) return (rc); if (sc->tids.ftids_in_use > 0) { rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (uld_active(sc, ULD_TOM)) { rc = EBUSY; goto done; } #endif rc = -t4_set_filter_mode(sc, fconf); done: end_synchronized_op(sc, LOCK_HELD); return (rc); } static inline uint64_t get_filter_hits(struct adapter *sc, uint32_t fid) { uint32_t tcb_addr; tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + (fid + sc->tids.ftid_base) * TCB_SIZE; if (is_t4(sc)) { uint64_t hits; read_via_memwin(sc, 0, tcb_addr + 16, (uint32_t *)&hits, 8); return (be64toh(hits)); } else { uint32_t hits; read_via_memwin(sc, 0, tcb_addr + 24, &hits, 4); return (be32toh(hits)); } } static int get_filter(struct adapter *sc, struct t4_filter *t) { int i, rc, nfilters = sc->tids.nftids; struct filter_entry *f; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4getf"); if (rc) return (rc); if (sc->tids.ftids_in_use == 0 || sc->tids.ftid_tab == NULL || t->idx >= nfilters) { t->idx = 0xffffffff; goto done; } f = &sc->tids.ftid_tab[t->idx]; for (i = t->idx; i < nfilters; i++, f++) { if (f->valid) { t->idx = i; t->l2tidx = f->l2t ? f->l2t->idx : 0; t->smtidx = f->smtidx; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, t->idx); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: end_synchronized_op(sc, LOCK_HELD); return (0); } static int set_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters, nports; struct filter_entry *f; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setf"); if (rc) return (rc); nfilters = sc->tids.nftids; nports = sc->params.nports; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (t->idx >= nfilters) { rc = EINVAL; goto done; } /* Validate against the global filter mode and ingress config */ rc = check_fspec_against_fconf_iconf(sc, &t->fs); if (rc != 0) goto done; if (t->fs.action == FILTER_SWITCH && t->fs.eport >= nports) { rc = EINVAL; goto done; } if (t->fs.val.iport >= nports) { rc = EINVAL; goto done; } /* Can't specify an iq if not steering to it */ if (!t->fs.dirsteer && t->fs.iq) { rc = EINVAL; goto done; } /* IPv6 filter idx must be 4 aligned */ if (t->fs.type == 1 && ((t->idx & 0x3) || t->idx + 4 >= nfilters)) { rc = EINVAL; goto done; } if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_full_init(sc)) != 0)) goto done; if (sc->tids.ftid_tab == NULL) { KASSERT(sc->tids.ftids_in_use == 0, ("%s: no memory allocated but filters_in_use > 0", __func__)); sc->tids.ftid_tab = malloc(sizeof (struct filter_entry) * nfilters, M_CXGBE, M_NOWAIT | M_ZERO); if (sc->tids.ftid_tab == NULL) { rc = ENOMEM; goto done; } mtx_init(&sc->tids.ftid_lock, "T4 filters", 0, MTX_DEF); } for (i = 0; i < 4; i++) { f = &sc->tids.ftid_tab[t->idx + i]; if (f->pending || f->valid) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (t->fs.type == 0) break; } f = &sc->tids.ftid_tab[t->idx]; f->fs = t->fs; rc = set_filter_wr(sc, t->idx); done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? 0 : EIO; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4setfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static int del_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters; struct filter_entry *f; int rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4delf"); if (rc) return (rc); nfilters = sc->tids.nftids; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (sc->tids.ftid_tab == NULL || sc->tids.ftids_in_use == 0 || t->idx >= nfilters) { rc = EINVAL; goto done; } if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } f = &sc->tids.ftid_tab[t->idx]; if (f->pending) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (f->valid) { t->fs = f->fs; /* extra info for the caller */ rc = del_filter_wr(sc, t->idx); } done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? EIO : 0; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4delfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static void clear_filter(struct filter_entry *f) { if (f->l2t) t4_l2t_release(f->l2t); bzero(f, sizeof (*f)); } static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct fw_filter_wr *fwr; unsigned int ftid, vnic_vld, vnic_vld_mask; struct wrq_cookie cookie; ASSERT_SYNCHRONIZED_OP(sc); if (f->fs.newdmac || f->fs.newvlan) { /* This filter needs an L2T entry; allocate one. */ f->l2t = t4_l2t_alloc_switching(sc->l2t); if (f->l2t == NULL) return (EAGAIN); if (t4_l2t_set_switching(sc, f->l2t, f->fs.vlan, f->fs.eport, f->fs.dmac)) { t4_l2t_release(f->l2t); f->l2t = NULL; return (ENOMEM); } } /* Already validated against fconf, iconf */ MPASS((f->fs.val.pfvf_vld & f->fs.val.ovlan_vld) == 0); MPASS((f->fs.mask.pfvf_vld & f->fs.mask.ovlan_vld) == 0); if (f->fs.val.pfvf_vld || f->fs.val.ovlan_vld) vnic_vld = 1; else vnic_vld = 0; if (f->fs.mask.pfvf_vld || f->fs.mask.ovlan_vld) vnic_vld_mask = 1; else vnic_vld_mask = 0; ftid = sc->tids.ftid_base + fidx; fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); if (fwr == NULL) return (ENOMEM); bzero(fwr, sizeof(*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); fwr->tid_to_iq = htobe32(V_FW_FILTER_WR_TID(ftid) | V_FW_FILTER_WR_RQTYPE(f->fs.type) | V_FW_FILTER_WR_NOREPLY(0) | V_FW_FILTER_WR_IQ(f->fs.iq)); fwr->del_filter_to_l2tix = htobe32(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) | V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) | V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) | V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) | V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) | V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) | V_FW_FILTER_WR_DMAC(f->fs.newdmac) | V_FW_FILTER_WR_SMAC(f->fs.newsmac) | V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) | V_FW_FILTER_WR_TXCHAN(f->fs.eport) | V_FW_FILTER_WR_PRIO(f->fs.prio) | V_FW_FILTER_WR_L2TIX(f->l2t ? f->l2t->idx : 0)); fwr->ethtype = htobe16(f->fs.val.ethtype); fwr->ethtypem = htobe16(f->fs.mask.ethtype); fwr->frag_to_ovlan_vldm = (V_FW_FILTER_WR_FRAG(f->fs.val.frag) | V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) | V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLD(vnic_vld) | V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLDM(vnic_vld_mask)); fwr->smac_sel = 0; fwr->rx_chan_rx_rpl_iq = htobe16(V_FW_FILTER_WR_RX_CHAN(0) | V_FW_FILTER_WR_RX_RPL_IQ(sc->sge.fwq.abs_id)); fwr->maci_to_matchtypem = htobe32(V_FW_FILTER_WR_MACI(f->fs.val.macidx) | V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) | V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) | V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) | V_FW_FILTER_WR_PORT(f->fs.val.iport) | V_FW_FILTER_WR_PORTM(f->fs.mask.iport) | V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) | V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype)); fwr->ptcl = f->fs.val.proto; fwr->ptclm = f->fs.mask.proto; fwr->ttyp = f->fs.val.tos; fwr->ttypm = f->fs.mask.tos; fwr->ivlan = htobe16(f->fs.val.vlan); fwr->ivlanm = htobe16(f->fs.mask.vlan); fwr->ovlan = htobe16(f->fs.val.vnic); fwr->ovlanm = htobe16(f->fs.mask.vnic); bcopy(f->fs.val.dip, fwr->lip, sizeof (fwr->lip)); bcopy(f->fs.mask.dip, fwr->lipm, sizeof (fwr->lipm)); bcopy(f->fs.val.sip, fwr->fip, sizeof (fwr->fip)); bcopy(f->fs.mask.sip, fwr->fipm, sizeof (fwr->fipm)); fwr->lp = htobe16(f->fs.val.dport); fwr->lpm = htobe16(f->fs.mask.dport); fwr->fp = htobe16(f->fs.val.sport); fwr->fpm = htobe16(f->fs.mask.sport); if (f->fs.newsmac) bcopy(f->fs.smac, fwr->sma, sizeof (fwr->sma)); f->pending = 1; sc->tids.ftids_in_use++; commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct fw_filter_wr *fwr; unsigned int ftid; struct wrq_cookie cookie; ftid = sc->tids.ftid_base + fidx; fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); if (fwr == NULL) return (ENOMEM); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id); f->pending = 1; commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } int t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1); unsigned int idx = GET_TID(rpl); unsigned int rc; struct filter_entry *f; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); MPASS(iq == &sc->sge.fwq); MPASS(is_ftid(sc, idx)); idx -= sc->tids.ftid_base; f = &sc->tids.ftid_tab[idx]; rc = G_COOKIE(rpl->cookie); mtx_lock(&sc->tids.ftid_lock); if (rc == FW_FILTER_WR_FLT_ADDED) { KASSERT(f->pending, ("%s: filter[%u] isn't pending.", __func__, idx)); f->smtidx = (be64toh(rpl->oldval) >> 24) & 0xff; f->pending = 0; /* asynchronous setup completed */ f->valid = 1; } else { if (rc != FW_FILTER_WR_FLT_DELETED) { /* Add or delete failed, display an error */ log(LOG_ERR, "filter %u setup failed with error %u\n", idx, rc); } clear_filter(f); sc->tids.ftids_in_use--; } wakeup(&sc->tids.ftid_tab); mtx_unlock(&sc->tids.ftid_lock); return (0); } static int set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { MPASS(iq->set_tcb_rpl != NULL); return (iq->set_tcb_rpl(iq, rss, m)); } static int l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { MPASS(iq->l2t_write_rpl != NULL); return (iq->l2t_write_rpl(iq, rss, m)); } static int get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt) { int rc; if (cntxt->cid > M_CTXTQID) return (EINVAL); if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS && cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt"); if (rc) return (rc); if (sc->flags & FW_OK) { rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); if (rc == 0) goto done; } /* * Read via firmware failed or wasn't even attempted. Read directly via * the backdoor. */ rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); done: end_synchronized_op(sc, 0); return (rc); } static int load_fw(struct adapter *sc, struct t4_data *fw) { int rc; uint8_t *fw_data; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw"); if (rc) return (rc); if (sc->flags & FULL_INIT_DONE) { rc = EBUSY; goto done; } fw_data = malloc(fw->len, M_CXGBE, M_WAITOK); if (fw_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(fw->data, fw_data, fw->len); if (rc == 0) rc = -t4_load_fw(sc, fw_data, fw->len); free(fw_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } +static int +load_cfg(struct adapter *sc, struct t4_data *cfg) +{ + int rc; + uint8_t *cfg_data = NULL; + + rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf"); + if (rc) + return (rc); + + if (cfg->len == 0) { + /* clear */ + rc = -t4_load_cfg(sc, NULL, 0); + goto done; + } + + cfg_data = malloc(cfg->len, M_CXGBE, M_WAITOK); + if (cfg_data == NULL) { + rc = ENOMEM; + goto done; + } + + rc = copyin(cfg->data, cfg_data, cfg->len); + if (rc == 0) + rc = -t4_load_cfg(sc, cfg_data, cfg->len); + + free(cfg_data, M_CXGBE); +done: + end_synchronized_op(sc, 0); + return (rc); +} + #define MAX_READ_BUF_SIZE (128 * 1024) static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) { uint32_t addr, remaining, n; uint32_t *buf; int rc; uint8_t *dst; rc = validate_mem_range(sc, mr->addr, mr->len); if (rc != 0) return (rc); buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK); addr = mr->addr; remaining = mr->len; dst = (void *)mr->data; while (remaining) { n = min(remaining, MAX_READ_BUF_SIZE); read_via_memwin(sc, 2, addr, buf, n); rc = copyout(buf, dst, n); if (rc != 0) break; dst += n; remaining -= n; addr += n; } free(buf, M_CXGBE); return (rc); } #undef MAX_READ_BUF_SIZE static int read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd) { int rc; if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports) return (EINVAL); if (i2cd->len > sizeof(i2cd->data)) return (EFBIG); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr, i2cd->offset, i2cd->len, &i2cd->data[0]); end_synchronized_op(sc, 0); return (rc); } static int in_range(int val, int lo, int hi) { return (val < 0 || (val <= hi && val >= lo)); } static int set_sched_class_config(struct adapter *sc, int minmax) { int rc; if (minmax < 0) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc"); if (rc) return (rc); rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1); end_synchronized_op(sc, 0); return (rc); } static int set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p, int sleep_ok) { int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode; struct port_info *pi; struct tx_sched_class *tc; if (p->level == SCHED_CLASS_LEVEL_CL_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL; else if (p->level == SCHED_CLASS_LEVEL_CL_WRR) fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR; else if (p->level == SCHED_CLASS_LEVEL_CH_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL; else return (EINVAL); if (p->mode == SCHED_CLASS_MODE_CLASS) fw_mode = FW_SCHED_PARAMS_MODE_CLASS; else if (p->mode == SCHED_CLASS_MODE_FLOW) fw_mode = FW_SCHED_PARAMS_MODE_FLOW; else return (EINVAL); if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE; else return (EINVAL); if (p->ratemode == SCHED_CLASS_RATEMODE_REL) fw_ratemode = FW_SCHED_PARAMS_RATE_REL; else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) fw_ratemode = FW_SCHED_PARAMS_RATE_ABS; else return (EINVAL); /* Vet our parameters ... */ if (!in_range(p->channel, 0, sc->chip_params->nchan - 1)) return (ERANGE); pi = sc->port[sc->chan_map[p->channel]]; if (pi == NULL) return (ENXIO); MPASS(pi->tx_chan == p->channel); top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */ if (!in_range(p->cl, 0, sc->chip_params->nsched_cls) || !in_range(p->minrate, 0, top_speed) || !in_range(p->maxrate, 0, top_speed) || !in_range(p->weight, 0, 100)) return (ERANGE); /* * Translate any unset parameters into the firmware's * nomenclature and/or fail the call if the parameters * are required ... */ if (p->rateunit < 0 || p->ratemode < 0 || p->channel < 0 || p->cl < 0) return (EINVAL); if (p->minrate < 0) p->minrate = 0; if (p->maxrate < 0) { if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CH_RL) return (EINVAL); else p->maxrate = 0; } if (p->weight < 0) { if (p->level == SCHED_CLASS_LEVEL_CL_WRR) return (EINVAL); else p->weight = 0; } if (p->pktsize < 0) { if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CH_RL) return (EINVAL); else p->pktsize = 0; } rc = begin_synchronized_op(sc, NULL, sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp"); if (rc) return (rc); tc = &pi->tc[p->cl]; tc->params = *p; rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level, fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl, p->minrate, p->maxrate, p->weight, p->pktsize, sleep_ok); if (rc == 0) tc->flags |= TX_SC_OK; else { /* * Unknown state at this point, see tc->params for what was * attempted. */ tc->flags &= ~TX_SC_OK; } end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD); return (rc); } int t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p) { if (p->type != SCHED_CLASS_TYPE_PACKET) return (EINVAL); if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG) return (set_sched_class_config(sc, p->u.config.minmax)); if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS) return (set_sched_class_params(sc, &p->u.params, 1)); return (EINVAL); } int t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p) { struct port_info *pi = NULL; struct vi_info *vi; struct sge_txq *txq; uint32_t fw_mnem, fw_queue, fw_class; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsq"); if (rc) return (rc); if (p->port >= sc->params.nports) { rc = EINVAL; goto done; } /* XXX: Only supported for the main VI. */ pi = sc->port[p->port]; vi = &pi->vi[0]; if (!(vi->flags & VI_INIT_DONE)) { /* tx queues not set up yet */ rc = EAGAIN; goto done; } if (!in_range(p->queue, 0, vi->ntxq - 1) || !in_range(p->cl, 0, sc->chip_params->nsched_cls - 1)) { rc = EINVAL; goto done; } /* * Create a template for the FW_PARAMS_CMD mnemonic and value (TX * Scheduling Class in this case). */ fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH)); fw_class = p->cl < 0 ? 0xffffffff : p->cl; /* * If op.queue is non-negative, then we're only changing the scheduling * on a single specified TX queue. */ if (p->queue >= 0) { txq = &sc->sge.txq[vi->first_txq + p->queue]; fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); goto done; } /* * Change the scheduling on all the TX queues for the * interface. */ for_each_txq(vi, i, txq) { fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); if (rc) goto done; } rc = 0; done: end_synchronized_op(sc, 0); return (rc); } int t4_os_find_pci_capability(struct adapter *sc, int cap) { int i; return (pci_find_cap(sc->dev, cap, &i) == 0 ? i : 0); } int t4_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t4_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } void t4_os_portmod_changed(const struct adapter *sc, int idx) { struct port_info *pi = sc->port[idx]; struct vi_info *vi; struct ifnet *ifp; int v; static const char *mod_str[] = { NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; for_each_vi(pi, v, vi) { build_medialist(pi, &vi->media); } ifp = pi->vi[0].ifp; if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) if_printf(ifp, "transceiver unplugged.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) if_printf(ifp, "unknown transceiver inserted.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) if_printf(ifp, "unsupported transceiver inserted.\n"); else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) { if_printf(ifp, "%s transceiver inserted.\n", mod_str[pi->mod_type]); } else { if_printf(ifp, "transceiver (type %d) inserted.\n", pi->mod_type); } } void t4_os_link_changed(struct adapter *sc, int idx, int link_stat, int reason) { struct port_info *pi = sc->port[idx]; struct vi_info *vi; struct ifnet *ifp; int v; if (link_stat) pi->linkdnrc = -1; else { if (reason >= 0) pi->linkdnrc = reason; } for_each_vi(pi, v, vi) { ifp = vi->ifp; if (ifp == NULL) continue; if (link_stat) { ifp->if_baudrate = IF_Mbps(pi->link_cfg.speed); if_link_state_change(ifp, LINK_STATE_UP); } else { if_link_state_change(ifp, LINK_STATE_DOWN); } } } void t4_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; sx_slock(&t4_list_lock); SLIST_FOREACH(sc, &t4_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. */ func(sc, arg); } sx_sunlock(&t4_list_lock); } static int t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int rc; struct adapter *sc = dev->si_drv1; rc = priv_check(td, PRIV_DRIVER); if (rc != 0) return (rc); switch (cmd) { case CHELSIO_T4_GETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) edata->val = t4_read_reg(sc, edata->addr); else if (edata->size == 8) edata->val = t4_read_reg64(sc, edata->addr); else return (EINVAL); break; } case CHELSIO_T4_SETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) { if (edata->val & 0xffffffff00000000) return (EINVAL); t4_write_reg(sc, edata->addr, (uint32_t) edata->val); } else if (edata->size == 8) t4_write_reg64(sc, edata->addr, edata->val); else return (EINVAL); break; } case CHELSIO_T4_REGDUMP: { struct t4_regdump *regs = (struct t4_regdump *)data; int reglen = t4_get_regs_len(sc); uint8_t *buf; if (regs->len < reglen) { regs->len = reglen; /* hint to the caller */ return (ENOBUFS); } regs->len = reglen; buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO); get_regs(sc, regs, buf); rc = copyout(buf, regs->data, reglen); free(buf, M_CXGBE); break; } case CHELSIO_T4_GET_FILTER_MODE: rc = get_filter_mode(sc, (uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MODE: rc = set_filter_mode(sc, *(uint32_t *)data); break; case CHELSIO_T4_GET_FILTER: rc = get_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_SET_FILTER: rc = set_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_DEL_FILTER: rc = del_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_GET_SGE_CONTEXT: rc = get_sge_context(sc, (struct t4_sge_context *)data); break; case CHELSIO_T4_LOAD_FW: rc = load_fw(sc, (struct t4_data *)data); break; case CHELSIO_T4_GET_MEM: rc = read_card_mem(sc, 2, (struct t4_mem_range *)data); break; case CHELSIO_T4_GET_I2C: rc = read_i2c(sc, (struct t4_i2c_data *)data); break; case CHELSIO_T4_CLEAR_STATS: { int i, v; u_int port_id = *(uint32_t *)data; struct port_info *pi; struct vi_info *vi; if (port_id >= sc->params.nports) return (EINVAL); pi = sc->port[port_id]; if (pi == NULL) return (EIO); /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); pi->tx_parse_error = 0; mtx_lock(&sc->reg_lock); for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) t4_clr_vi_stats(sc, vi->viid); } mtx_unlock(&sc->reg_lock); /* * Since this command accepts a port, clear stats for * all VIs on this port. */ for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) { struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; for_each_rxq(vi, i, rxq) { #if defined(INET) || defined(INET6) rxq->lro.lro_queued = 0; rxq->lro.lro_flushed = 0; #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; } for_each_txq(vi, i, txq) { txq->txcsum = 0; txq->tso_wrs = 0; txq->vlan_insertion = 0; txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; txq->txpkts0_wrs = 0; txq->txpkts1_wrs = 0; txq->txpkts0_pkts = 0; txq->txpkts1_pkts = 0; mp_ring_reset_stats(txq->r); } #ifdef TCP_OFFLOAD /* nothing to clear for each ofld_rxq */ for_each_ofld_txq(vi, i, wrq) { wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } #endif if (IS_MAIN_VI(vi)) { wrq = &sc->sge.ctrlq[pi->port_id]; wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } } } break; } case CHELSIO_T4_SCHED_CLASS: rc = t4_set_sched_class(sc, (struct t4_sched_params *)data); break; case CHELSIO_T4_SCHED_QUEUE: rc = t4_set_sched_queue(sc, (struct t4_sched_queue *)data); break; case CHELSIO_T4_GET_TRACER: rc = t4_get_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_SET_TRACER: rc = t4_set_tracer(sc, (struct t4_tracer *)data); + break; + case CHELSIO_T4_LOAD_CFG: + rc = load_cfg(sc, (struct t4_data *)data); break; default: rc = ENOTTY; } return (rc); } void t4_db_full(struct adapter *sc) { CXGBE_UNIMPLEMENTED(__func__); } void t4_db_dropped(struct adapter *sc) { CXGBE_UNIMPLEMENTED(__func__); } #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *vi, int enable) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; ASSERT_SYNCHRONIZED_OP(sc); if (!is_offload(sc)) return (ENODEV); if (enable) { if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) { /* TOE is already enabled. */ return (0); } /* * We need the port's queues around so that we're able to send * and receive CPLs to/from the TOE even if the ifnet for this * port has never been UP'd administratively. */ if (!(vi->flags & VI_INIT_DONE)) { rc = vi_full_init(vi); if (rc) return (rc); } if (!(pi->vi[0].flags & VI_INIT_DONE)) { rc = vi_full_init(&pi->vi[0]); if (rc) return (rc); } if (isset(&sc->offload_map, pi->port_id)) { /* TOE is enabled on another VI of this port. */ pi->uld_vis++; return (0); } if (!uld_active(sc, ULD_TOM)) { rc = t4_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t4_tom.ko before trying " "to enable TOE on a cxgbe interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM activated but flag not set", __func__)); } /* Activate iWARP and iSCSI too, if the modules are loaded. */ if (!uld_active(sc, ULD_IWARP)) (void) t4_activate_uld(sc, ULD_IWARP); if (!uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); pi->uld_vis++; setbit(&sc->offload_map, pi->port_id); } else { pi->uld_vis--; if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0) return (0); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. */ int t4_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t4_uld_list, ui, link); ui->refcount = 0; done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t4_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_activate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = EAGAIN; /* kldoad the module with this ULD and try again. */ sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { if (!(sc->flags & FULL_INIT_DONE)) { rc = adapter_full_init(sc); if (rc != 0) break; } rc = ui->activate(sc); if (rc == 0) { setbit(&sc->active_ulds, id); ui->refcount++; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int t4_deactivate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = ENXIO; sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) { clrbit(&sc->active_ulds, id); ui->refcount--; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int uld_active(struct adapter *sc, int uld_id) { MPASS(uld_id >= 0 && uld_id <= ULD_MAX); return (isset(&sc->active_ulds, uld_id)); } #endif /* * Come up with reasonable defaults for some of the tunables, provided they're * not set by the user (in which case we'll use the values as is). */ static void tweak_tunables(void) { int nc = mp_ncpus; /* our snapshot of the number of CPUs */ if (t4_ntxq10g < 1) { #ifdef RSS t4_ntxq10g = rss_getnumbuckets(); #else t4_ntxq10g = min(nc, NTXQ_10G); #endif } if (t4_ntxq1g < 1) { #ifdef RSS /* XXX: way too many for 1GbE? */ t4_ntxq1g = rss_getnumbuckets(); #else t4_ntxq1g = min(nc, NTXQ_1G); #endif } if (t4_ntxq_vi < 1) t4_ntxq_vi = min(nc, NTXQ_VI); if (t4_nrxq10g < 1) { #ifdef RSS t4_nrxq10g = rss_getnumbuckets(); #else t4_nrxq10g = min(nc, NRXQ_10G); #endif } if (t4_nrxq1g < 1) { #ifdef RSS /* XXX: way too many for 1GbE? */ t4_nrxq1g = rss_getnumbuckets(); #else t4_nrxq1g = min(nc, NRXQ_1G); #endif } if (t4_nrxq_vi < 1) t4_nrxq_vi = min(nc, NRXQ_VI); #ifdef TCP_OFFLOAD if (t4_nofldtxq10g < 1) t4_nofldtxq10g = min(nc, NOFLDTXQ_10G); if (t4_nofldtxq1g < 1) t4_nofldtxq1g = min(nc, NOFLDTXQ_1G); if (t4_nofldtxq_vi < 1) t4_nofldtxq_vi = min(nc, NOFLDTXQ_VI); if (t4_nofldrxq10g < 1) t4_nofldrxq10g = min(nc, NOFLDRXQ_10G); if (t4_nofldrxq1g < 1) t4_nofldrxq1g = min(nc, NOFLDRXQ_1G); if (t4_nofldrxq_vi < 1) t4_nofldrxq_vi = min(nc, NOFLDRXQ_VI); if (t4_toecaps_allowed == -1) t4_toecaps_allowed = FW_CAPS_CONFIG_TOE; if (t4_rdmacaps_allowed == -1) { t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP | FW_CAPS_CONFIG_RDMA_RDMAC; } if (t4_iscsicaps_allowed == -1) { t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU | FW_CAPS_CONFIG_ISCSI_TARGET_PDU | FW_CAPS_CONFIG_ISCSI_T10DIF; } #else if (t4_toecaps_allowed == -1) t4_toecaps_allowed = 0; if (t4_rdmacaps_allowed == -1) t4_rdmacaps_allowed = 0; if (t4_iscsicaps_allowed == -1) t4_iscsicaps_allowed = 0; #endif #ifdef DEV_NETMAP if (t4_nnmtxq_vi < 1) t4_nnmtxq_vi = min(nc, NNMTXQ_VI); if (t4_nnmrxq_vi < 1) t4_nnmrxq_vi = min(nc, NNMRXQ_VI); #endif if (t4_tmr_idx_10g < 0 || t4_tmr_idx_10g >= SGE_NTIMERS) t4_tmr_idx_10g = TMR_IDX_10G; if (t4_pktc_idx_10g < -1 || t4_pktc_idx_10g >= SGE_NCOUNTERS) t4_pktc_idx_10g = PKTC_IDX_10G; if (t4_tmr_idx_1g < 0 || t4_tmr_idx_1g >= SGE_NTIMERS) t4_tmr_idx_1g = TMR_IDX_1G; if (t4_pktc_idx_1g < -1 || t4_pktc_idx_1g >= SGE_NCOUNTERS) t4_pktc_idx_1g = PKTC_IDX_1G; if (t4_qsize_txq < 128) t4_qsize_txq = 128; if (t4_qsize_rxq < 128) t4_qsize_rxq = 128; while (t4_qsize_rxq & 7) t4_qsize_rxq++; t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX; } #ifdef DDB static void t4_dump_tcb(struct adapter *sc, int tid) { uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos; reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2); save = t4_read_reg(sc, reg); base = sc->memwin[2].mw_base; /* Dump TCB for the tid */ tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE); tcb_addr += tid * TCB_SIZE; if (is_t4(sc)) { pf = 0; win_pos = tcb_addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); win_pos = tcb_addr & ~0x7f; /* start must be 128B aligned */ } t4_write_reg(sc, reg, win_pos | pf); t4_read_reg(sc, reg); off = tcb_addr - win_pos; for (i = 0; i < 4; i++) { uint32_t buf[8]; for (j = 0; j < 8; j++, off += 4) buf[j] = htonl(t4_read_reg(sc, base + off)); db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); } t4_write_reg(sc, reg, save); t4_read_reg(sc, reg); } static void t4_dump_devlog(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e e; int i, first, j, m, nentries, rc; uint64_t ftstamp = UINT64_MAX; if (dparams->start == 0) { db_printf("devlog params not valid\n"); return; } nentries = dparams->size / sizeof(struct fw_devlog_e); m = fwmtype_to_hwmtype(dparams->memtype); /* Find the first entry. */ first = -1; for (i = 0; i < nentries && !db_pager_quit; i++) { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) break; if (e.timestamp == 0) break; e.timestamp = be64toh(e.timestamp); if (e.timestamp < ftstamp) { ftstamp = e.timestamp; first = i; } } if (first == -1) return; i = first; do { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) return; if (e.timestamp == 0) return; e.timestamp = be64toh(e.timestamp); e.seqno = be32toh(e.seqno); for (j = 0; j < 8; j++) e.params[j] = be32toh(e.params[j]); db_printf("%10d %15ju %8s %8s ", e.seqno, e.timestamp, (e.level < nitems(devlog_level_strings) ? devlog_level_strings[e.level] : "UNKNOWN"), (e.facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e.facility] : "UNKNOWN")); db_printf(e.fmt, e.params[0], e.params[1], e.params[2], e.params[3], e.params[4], e.params[5], e.params[6], e.params[7]); if (++i == nentries) i = 0; } while (i != first && !db_pager_quit); } static struct command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table); _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table); DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL) { device_t dev; int t; bool valid; valid = false; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); valid = true; } db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 devlog \n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } t4_dump_devlog(device_get_softc(dev)); } DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL) { device_t dev; int radix, tid, t; bool valid; valid = false; radix = db_radix; db_radix = 10; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); t = db_read_token(); if (t == tNUMBER) { tid = db_tok_number; valid = true; } } db_radix = radix; db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 tcb \n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } if (tid < 0) { db_printf("invalid tid\n"); return; } t4_dump_tcb(device_get_softc(dev), tid); } #endif static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); static int mod_event(module_t mod, int cmd, void *arg) { int rc = 0; static int loaded = 0; switch (cmd) { case MOD_LOAD: sx_xlock(&mlu); if (loaded++ == 0) { t4_sge_modload(); t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl); t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl); t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt); t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt); sx_init(&t4_list_lock, "T4/T5 adapters"); SLIST_INIT(&t4_list); #ifdef TCP_OFFLOAD sx_init(&t4_uld_list_lock, "T4/T5 ULDs"); SLIST_INIT(&t4_uld_list); #endif t4_tracer_modload(); tweak_tunables(); } sx_xunlock(&mlu); break; case MOD_UNLOAD: sx_xlock(&mlu); if (--loaded == 0) { int tries; sx_slock(&t4_list_lock); if (!SLIST_EMPTY(&t4_list)) { rc = EBUSY; sx_sunlock(&t4_list_lock); goto done_unload; } #ifdef TCP_OFFLOAD sx_slock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; sx_sunlock(&t4_uld_list_lock); sx_sunlock(&t4_list_lock); goto done_unload; } #endif tries = 0; while (tries++ < 5 && t4_sge_extfree_refs() != 0) { uprintf("%ju clusters with custom free routine " "still is use.\n", t4_sge_extfree_refs()); pause("t4unload", 2 * hz); } #ifdef TCP_OFFLOAD sx_sunlock(&t4_uld_list_lock); #endif sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { t4_tracer_modunload(); #ifdef TCP_OFFLOAD sx_destroy(&t4_uld_list_lock); #endif sx_destroy(&t4_list_lock); t4_sge_modunload(); loaded = 0; } else { rc = EBUSY; loaded++; /* undo earlier decrement */ } } done_unload: sx_xunlock(&mlu); break; } return (rc); } static devclass_t t4_devclass, t5_devclass, t6_devclass; static devclass_t cxgbe_devclass, cxl_devclass, cc_devclass; static devclass_t vcxgbe_devclass, vcxl_devclass, vcc_devclass; DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0); MODULE_VERSION(t4nex, 1); MODULE_DEPEND(t4nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t4nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0); MODULE_VERSION(t5nex, 1); MODULE_DEPEND(t5nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t5nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t6nex, pci, t6_driver, t6_devclass, mod_event, 0); MODULE_VERSION(t6nex, 1); MODULE_DEPEND(t6nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t6nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0); MODULE_VERSION(cxgbe, 1); DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0); MODULE_VERSION(cxl, 1); DRIVER_MODULE(cc, t6nex, cc_driver, cc_devclass, 0, 0); MODULE_VERSION(cc, 1); DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, vcxgbe_devclass, 0, 0); MODULE_VERSION(vcxgbe, 1); DRIVER_MODULE(vcxl, cxl, vcxl_driver, vcxl_devclass, 0, 0); MODULE_VERSION(vcxl, 1); DRIVER_MODULE(vcc, cc, vcc_driver, vcc_devclass, 0, 0); MODULE_VERSION(vcc, 1); Index: user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_hw.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_hw.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_hw.c (revision 306832) @@ -1,4966 +1,5068 @@ /* * Copyright (c) 2013-2016 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: ql_hw.c * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. * Content: Contains Hardware dependent functions */ #include __FBSDID("$FreeBSD$"); #include "ql_os.h" #include "ql_hw.h" #include "ql_def.h" #include "ql_inline.h" #include "ql_ver.h" #include "ql_glbl.h" #include "ql_dbg.h" #include "ql_minidump.h" /* * Static Functions */ static void qla_del_rcv_cntxt(qla_host_t *ha); static int qla_init_rcv_cntxt(qla_host_t *ha); static void qla_del_xmt_cntxt(qla_host_t *ha); static int qla_init_xmt_cntxt(qla_host_t *ha); static void qla_hw_tx_done_locked(qla_host_t *ha, uint32_t txr_idx); static int qla_mbx_cmd(qla_host_t *ha, uint32_t *h_mbox, uint32_t n_hmbox, uint32_t *fw_mbox, uint32_t n_fwmbox, uint32_t no_pause); static int qla_config_intr_cntxt(qla_host_t *ha, uint32_t start_idx, uint32_t num_intrs, uint32_t create); static int qla_config_rss(qla_host_t *ha, uint16_t cntxt_id); static int qla_config_intr_coalesce(qla_host_t *ha, uint16_t cntxt_id, int tenable, int rcv); static int qla_set_mac_rcv_mode(qla_host_t *ha, uint32_t mode); static int qla_link_event_req(qla_host_t *ha, uint16_t cntxt_id); static int qla_tx_tso(qla_host_t *ha, struct mbuf *mp, q80_tx_cmd_t *tx_cmd, uint8_t *hdr); static int qla_hw_add_all_mcast(qla_host_t *ha); static int qla_hw_del_all_mcast(qla_host_t *ha); static int qla_add_rcv_rings(qla_host_t *ha, uint32_t sds_idx, uint32_t nsds); static int qla_init_nic_func(qla_host_t *ha); static int qla_stop_nic_func(qla_host_t *ha); static int qla_query_fw_dcbx_caps(qla_host_t *ha); static int qla_set_port_config(qla_host_t *ha, uint32_t cfg_bits); static int qla_get_port_config(qla_host_t *ha, uint32_t *cfg_bits); static void qla_get_quick_stats(qla_host_t *ha); static int qla_set_cam_search_mode(qla_host_t *ha, uint32_t search_mode); static int qla_get_cam_search_mode(qla_host_t *ha); static void ql_minidump_free(qla_host_t *ha); static int qla_sysctl_get_drvr_stats(SYSCTL_HANDLER_ARGS) { int err = 0, ret; qla_host_t *ha; uint32_t i; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; for (i = 0; i < ha->hw.num_sds_rings; i++) { device_printf(ha->pci_dev, "%s: sds_ring[%d] = %p\n", __func__,i, (void *)ha->hw.sds[i].intr_count); device_printf(ha->pci_dev, "%s: sds_ring[%d].spurious_intr_count = %p\n", __func__, i, (void *)ha->hw.sds[i].spurious_intr_count); device_printf(ha->pci_dev, "%s: sds_ring[%d].rx_free = %d\n", __func__,i, ha->hw.sds[i].rx_free); } for (i = 0; i < ha->hw.num_tx_rings; i++) device_printf(ha->pci_dev, "%s: tx[%d] = %p\n", __func__,i, (void *)ha->tx_ring[i].count); for (i = 0; i < ha->hw.num_rds_rings; i++) device_printf(ha->pci_dev, "%s: rds_ring[%d] = %p\n", __func__,i, (void *)ha->hw.rds[i].count); device_printf(ha->pci_dev, "%s: lro_pkt_count = %p\n", __func__, (void *)ha->lro_pkt_count); device_printf(ha->pci_dev, "%s: lro_bytes = %p\n", __func__, (void *)ha->lro_bytes); #ifdef QL_ENABLE_ISCSI_TLV device_printf(ha->pci_dev, "%s: iscsi_pkts = %p\n", __func__, (void *)ha->hw.iscsi_pkt_count); #endif /* #ifdef QL_ENABLE_ISCSI_TLV */ } return (err); } static int qla_sysctl_get_quick_stats(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; qla_get_quick_stats(ha); } return (err); } #ifdef QL_DBG static void qla_stop_pegs(qla_host_t *ha) { uint32_t val = 1; ql_rdwr_indreg32(ha, Q8_CRB_PEG_0, &val, 0); ql_rdwr_indreg32(ha, Q8_CRB_PEG_1, &val, 0); ql_rdwr_indreg32(ha, Q8_CRB_PEG_2, &val, 0); ql_rdwr_indreg32(ha, Q8_CRB_PEG_3, &val, 0); ql_rdwr_indreg32(ha, Q8_CRB_PEG_4, &val, 0); device_printf(ha->pci_dev, "%s PEGS HALTED!!!!!\n", __func__); } static int qla_sysctl_stop_pegs(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; (void)QLA_LOCK(ha, __func__, 0); qla_stop_pegs(ha); QLA_UNLOCK(ha, __func__); } return err; } #endif /* #ifdef QL_DBG */ static int qla_validate_set_port_cfg_bit(uint32_t bits) { if ((bits & 0xF) > 1) return (-1); if (((bits >> 4) & 0xF) > 2) return (-1); if (((bits >> 8) & 0xF) > 2) return (-1); return (0); } static int qla_sysctl_port_cfg(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; uint32_t cfg_bits; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if ((qla_validate_set_port_cfg_bit((uint32_t)ret) == 0)) { ha = (qla_host_t *)arg1; err = qla_get_port_config(ha, &cfg_bits); if (err) goto qla_sysctl_set_port_cfg_exit; if (ret & 0x1) { cfg_bits |= Q8_PORT_CFG_BITS_DCBX_ENABLE; } else { cfg_bits &= ~Q8_PORT_CFG_BITS_DCBX_ENABLE; } ret = ret >> 4; cfg_bits &= ~Q8_PORT_CFG_BITS_PAUSE_CFG_MASK; if ((ret & 0xF) == 0) { cfg_bits |= Q8_PORT_CFG_BITS_PAUSE_DISABLED; } else if ((ret & 0xF) == 1){ cfg_bits |= Q8_PORT_CFG_BITS_PAUSE_STD; } else { cfg_bits |= Q8_PORT_CFG_BITS_PAUSE_PPM; } ret = ret >> 4; cfg_bits &= ~Q8_PORT_CFG_BITS_STDPAUSE_DIR_MASK; if (ret == 0) { cfg_bits |= Q8_PORT_CFG_BITS_STDPAUSE_XMT_RCV; } else if (ret == 1){ cfg_bits |= Q8_PORT_CFG_BITS_STDPAUSE_XMT; } else { cfg_bits |= Q8_PORT_CFG_BITS_STDPAUSE_RCV; } err = qla_set_port_config(ha, cfg_bits); } else { ha = (qla_host_t *)arg1; err = qla_get_port_config(ha, &cfg_bits); } qla_sysctl_set_port_cfg_exit: return err; } static int qla_sysctl_set_cam_search_mode(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); ha = (qla_host_t *)arg1; if ((ret == Q8_HW_CONFIG_CAM_SEARCH_MODE_INTERNAL) || (ret == Q8_HW_CONFIG_CAM_SEARCH_MODE_AUTO)) { err = qla_set_cam_search_mode(ha, (uint32_t)ret); } else { device_printf(ha->pci_dev, "%s: ret = %d\n", __func__, ret); } return (err); } static int qla_sysctl_get_cam_search_mode(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); ha = (qla_host_t *)arg1; err = qla_get_cam_search_mode(ha); return (err); } /* * Name: ql_hw_add_sysctls * Function: Add P3Plus specific sysctls */ void ql_hw_add_sysctls(qla_host_t *ha) { device_t dev; dev = ha->pci_dev; ha->hw.num_sds_rings = MAX_SDS_RINGS; ha->hw.num_rds_rings = MAX_RDS_RINGS; ha->hw.num_tx_rings = NUM_TX_RINGS; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "num_rds_rings", CTLFLAG_RD, &ha->hw.num_rds_rings, ha->hw.num_rds_rings, "Number of Rcv Descriptor Rings"); SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "num_sds_rings", CTLFLAG_RD, &ha->hw.num_sds_rings, ha->hw.num_sds_rings, "Number of Status Descriptor Rings"); SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "num_tx_rings", CTLFLAG_RD, &ha->hw.num_tx_rings, ha->hw.num_tx_rings, "Number of Transmit Rings"); SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "tx_ring_index", CTLFLAG_RW, &ha->txr_idx, ha->txr_idx, "Tx Ring Used"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "drvr_stats", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_get_drvr_stats, "I", "Driver Maintained Statistics"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "quick_stats", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_get_quick_stats, "I", "Quick Statistics"); SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "max_tx_segs", CTLFLAG_RD, &ha->hw.max_tx_segs, ha->hw.max_tx_segs, "Max # of Segments in a non-TSO pkt"); ha->hw.sds_cidx_thres = 32; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "sds_cidx_thres", CTLFLAG_RW, &ha->hw.sds_cidx_thres, ha->hw.sds_cidx_thres, "Number of SDS entries to process before updating" " SDS Ring Consumer Index"); ha->hw.rds_pidx_thres = 32; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "rds_pidx_thres", CTLFLAG_RW, &ha->hw.rds_pidx_thres, ha->hw.rds_pidx_thres, "Number of Rcv Rings Entries to post before updating" " RDS Ring Producer Index"); ha->hw.rcv_intr_coalesce = (3 << 16) | 256; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "rcv_intr_coalesce", CTLFLAG_RW, &ha->hw.rcv_intr_coalesce, ha->hw.rcv_intr_coalesce, "Rcv Intr Coalescing Parameters\n" "\tbits 15:0 max packets\n" "\tbits 31:16 max micro-seconds to wait\n" "\tplease run\n" "\tifconfig down && ifconfig up\n" "\tto take effect \n"); ha->hw.xmt_intr_coalesce = (64 << 16) | 64; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "xmt_intr_coalesce", CTLFLAG_RW, &ha->hw.xmt_intr_coalesce, ha->hw.xmt_intr_coalesce, "Xmt Intr Coalescing Parameters\n" "\tbits 15:0 max packets\n" "\tbits 31:16 max micro-seconds to wait\n" "\tplease run\n" "\tifconfig down && ifconfig up\n" "\tto take effect \n"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "port_cfg", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_port_cfg, "I", "Set Port Configuration if values below " "otherwise Get Port Configuration\n" "\tBits 0-3 ; 1 = DCBX Enable; 0 = DCBX Disable\n" "\tBits 4-7 : 0 = no pause; 1 = std ; 2 = ppm \n" "\tBits 8-11: std pause cfg; 0 = xmt and rcv;" " 1 = xmt only; 2 = rcv only;\n" ); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "set_cam_search_mode", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_set_cam_search_mode, "I", "Set CAM Search Mode" "\t 1 = search mode internal\n" "\t 2 = search mode auto\n"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "get_cam_search_mode", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_get_cam_search_mode, "I", "Get CAM Search Mode" "\t 1 = search mode internal\n" "\t 2 = search mode auto\n"); ha->hw.enable_9kb = 1; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_9kb", CTLFLAG_RW, &ha->hw.enable_9kb, ha->hw.enable_9kb, "Enable 9Kbyte Buffers when MTU = 9000"); ha->hw.mdump_active = 0; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "minidump_active", CTLFLAG_RW, &ha->hw.mdump_active, ha->hw.mdump_active, "Minidump retrieval is Active"); ha->hw.mdump_done = 0; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "mdump_done", CTLFLAG_RW, &ha->hw.mdump_done, ha->hw.mdump_done, "Minidump has been done and available for retrieval"); ha->hw.mdump_capture_mask = 0xF; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "minidump_capture_mask", CTLFLAG_RW, &ha->hw.mdump_capture_mask, ha->hw.mdump_capture_mask, "Minidump capture mask"); #ifdef QL_DBG ha->err_inject = 0; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "err_inject", CTLFLAG_RW, &ha->err_inject, ha->err_inject, "Error to be injected\n" "\t\t\t 0: No Errors\n" "\t\t\t 1: rcv: rxb struct invalid\n" "\t\t\t 2: rcv: mp == NULL\n" "\t\t\t 3: lro: rxb struct invalid\n" "\t\t\t 4: lro: mp == NULL\n" "\t\t\t 5: rcv: num handles invalid\n" "\t\t\t 6: reg: indirect reg rd_wr failure\n" "\t\t\t 7: ocm: offchip memory rd_wr failure\n" "\t\t\t 8: mbx: mailbox command failure\n" "\t\t\t 9: heartbeat failure\n" "\t\t\t A: temperature failure\n" "\t\t\t 11: m_getcl or m_getjcl failure\n" ); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "peg_stop", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_stop_pegs, "I", "Peg Stop"); #endif /* #ifdef QL_DBG */ ha->hw.user_pri_nic = 0; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "user_pri_nic", CTLFLAG_RW, &ha->hw.user_pri_nic, ha->hw.user_pri_nic, "VLAN Tag User Priority for Normal Ethernet Packets"); ha->hw.user_pri_iscsi = 4; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "user_pri_iscsi", CTLFLAG_RW, &ha->hw.user_pri_iscsi, ha->hw.user_pri_iscsi, "VLAN Tag User Priority for iSCSI Packets"); } void ql_hw_link_status(qla_host_t *ha) { device_printf(ha->pci_dev, "cable_oui\t\t 0x%08x\n", ha->hw.cable_oui); if (ha->hw.link_up) { device_printf(ha->pci_dev, "link Up\n"); } else { device_printf(ha->pci_dev, "link Down\n"); } if (ha->hw.flags.fduplex) { device_printf(ha->pci_dev, "Full Duplex\n"); } else { device_printf(ha->pci_dev, "Half Duplex\n"); } if (ha->hw.flags.autoneg) { device_printf(ha->pci_dev, "Auto Negotiation Enabled\n"); } else { device_printf(ha->pci_dev, "Auto Negotiation Disabled\n"); } switch (ha->hw.link_speed) { case 0x710: device_printf(ha->pci_dev, "link speed\t\t 10Gps\n"); break; case 0x3E8: device_printf(ha->pci_dev, "link speed\t\t 1Gps\n"); break; case 0x64: device_printf(ha->pci_dev, "link speed\t\t 100Mbps\n"); break; default: device_printf(ha->pci_dev, "link speed\t\t Unknown\n"); break; } switch (ha->hw.module_type) { case 0x01: device_printf(ha->pci_dev, "Module Type 10GBase-LRM\n"); break; case 0x02: device_printf(ha->pci_dev, "Module Type 10GBase-LR\n"); break; case 0x03: device_printf(ha->pci_dev, "Module Type 10GBase-SR\n"); break; case 0x04: device_printf(ha->pci_dev, "Module Type 10GE Passive Copper(Compliant)[%d m]\n", ha->hw.cable_length); break; case 0x05: device_printf(ha->pci_dev, "Module Type 10GE Active" " Limiting Copper(Compliant)[%d m]\n", ha->hw.cable_length); break; case 0x06: device_printf(ha->pci_dev, "Module Type 10GE Passive Copper" " (Legacy, Best Effort)[%d m]\n", ha->hw.cable_length); break; case 0x07: device_printf(ha->pci_dev, "Module Type 1000Base-SX\n"); break; case 0x08: device_printf(ha->pci_dev, "Module Type 1000Base-LX\n"); break; case 0x09: device_printf(ha->pci_dev, "Module Type 1000Base-CX\n"); break; case 0x0A: device_printf(ha->pci_dev, "Module Type 1000Base-T\n"); break; case 0x0B: device_printf(ha->pci_dev, "Module Type 1GE Passive Copper" "(Legacy, Best Effort)\n"); break; default: device_printf(ha->pci_dev, "Unknown Module Type 0x%x\n", ha->hw.module_type); break; } if (ha->hw.link_faults == 1) device_printf(ha->pci_dev, "SFP Power Fault\n"); } /* * Name: ql_free_dma * Function: Frees the DMA'able memory allocated in ql_alloc_dma() */ void ql_free_dma(qla_host_t *ha) { uint32_t i; if (ha->hw.dma_buf.flags.sds_ring) { for (i = 0; i < ha->hw.num_sds_rings; i++) { ql_free_dmabuf(ha, &ha->hw.dma_buf.sds_ring[i]); } ha->hw.dma_buf.flags.sds_ring = 0; } if (ha->hw.dma_buf.flags.rds_ring) { for (i = 0; i < ha->hw.num_rds_rings; i++) { ql_free_dmabuf(ha, &ha->hw.dma_buf.rds_ring[i]); } ha->hw.dma_buf.flags.rds_ring = 0; } if (ha->hw.dma_buf.flags.tx_ring) { ql_free_dmabuf(ha, &ha->hw.dma_buf.tx_ring); ha->hw.dma_buf.flags.tx_ring = 0; } ql_minidump_free(ha); } /* * Name: ql_alloc_dma * Function: Allocates DMA'able memory for Tx/Rx Rings, Tx/Rx Contexts. */ int ql_alloc_dma(qla_host_t *ha) { device_t dev; uint32_t i, j, size, tx_ring_size; qla_hw_t *hw; qla_hw_tx_cntxt_t *tx_cntxt; uint8_t *vaddr; bus_addr_t paddr; dev = ha->pci_dev; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); hw = &ha->hw; /* * Allocate Transmit Ring */ tx_ring_size = (sizeof(q80_tx_cmd_t) * NUM_TX_DESCRIPTORS); size = (tx_ring_size * ha->hw.num_tx_rings); hw->dma_buf.tx_ring.alignment = 8; hw->dma_buf.tx_ring.size = size + PAGE_SIZE; if (ql_alloc_dmabuf(ha, &hw->dma_buf.tx_ring)) { device_printf(dev, "%s: tx ring alloc failed\n", __func__); goto ql_alloc_dma_exit; } vaddr = (uint8_t *)hw->dma_buf.tx_ring.dma_b; paddr = hw->dma_buf.tx_ring.dma_addr; for (i = 0; i < ha->hw.num_tx_rings; i++) { tx_cntxt = (qla_hw_tx_cntxt_t *)&hw->tx_cntxt[i]; tx_cntxt->tx_ring_base = (q80_tx_cmd_t *)vaddr; tx_cntxt->tx_ring_paddr = paddr; vaddr += tx_ring_size; paddr += tx_ring_size; } for (i = 0; i < ha->hw.num_tx_rings; i++) { tx_cntxt = (qla_hw_tx_cntxt_t *)&hw->tx_cntxt[i]; tx_cntxt->tx_cons = (uint32_t *)vaddr; tx_cntxt->tx_cons_paddr = paddr; vaddr += sizeof (uint32_t); paddr += sizeof (uint32_t); } ha->hw.dma_buf.flags.tx_ring = 1; QL_DPRINT2(ha, (dev, "%s: tx_ring phys %p virt %p\n", __func__, (void *)(hw->dma_buf.tx_ring.dma_addr), hw->dma_buf.tx_ring.dma_b)); /* * Allocate Receive Descriptor Rings */ for (i = 0; i < hw->num_rds_rings; i++) { hw->dma_buf.rds_ring[i].alignment = 8; hw->dma_buf.rds_ring[i].size = (sizeof(q80_recv_desc_t)) * NUM_RX_DESCRIPTORS; if (ql_alloc_dmabuf(ha, &hw->dma_buf.rds_ring[i])) { device_printf(dev, "%s: rds ring[%d] alloc failed\n", __func__, i); for (j = 0; j < i; j++) ql_free_dmabuf(ha, &hw->dma_buf.rds_ring[j]); goto ql_alloc_dma_exit; } QL_DPRINT4(ha, (dev, "%s: rx_ring[%d] phys %p virt %p\n", __func__, i, (void *)(hw->dma_buf.rds_ring[i].dma_addr), hw->dma_buf.rds_ring[i].dma_b)); } hw->dma_buf.flags.rds_ring = 1; /* * Allocate Status Descriptor Rings */ for (i = 0; i < hw->num_sds_rings; i++) { hw->dma_buf.sds_ring[i].alignment = 8; hw->dma_buf.sds_ring[i].size = (sizeof(q80_stat_desc_t)) * NUM_STATUS_DESCRIPTORS; if (ql_alloc_dmabuf(ha, &hw->dma_buf.sds_ring[i])) { device_printf(dev, "%s: sds ring alloc failed\n", __func__); for (j = 0; j < i; j++) ql_free_dmabuf(ha, &hw->dma_buf.sds_ring[j]); goto ql_alloc_dma_exit; } QL_DPRINT4(ha, (dev, "%s: sds_ring[%d] phys %p virt %p\n", __func__, i, (void *)(hw->dma_buf.sds_ring[i].dma_addr), hw->dma_buf.sds_ring[i].dma_b)); } for (i = 0; i < hw->num_sds_rings; i++) { hw->sds[i].sds_ring_base = (q80_stat_desc_t *)hw->dma_buf.sds_ring[i].dma_b; } hw->dma_buf.flags.sds_ring = 1; return 0; ql_alloc_dma_exit: ql_free_dma(ha); return -1; } #define Q8_MBX_MSEC_DELAY 5000 static int qla_mbx_cmd(qla_host_t *ha, uint32_t *h_mbox, uint32_t n_hmbox, uint32_t *fw_mbox, uint32_t n_fwmbox, uint32_t no_pause) { uint32_t i; uint32_t data; int ret = 0; if (QL_ERR_INJECT(ha, INJCT_MBX_CMD_FAILURE)) { ret = -3; ha->qla_initiate_recovery = 1; goto exit_qla_mbx_cmd; } if (no_pause) i = 1000; else i = Q8_MBX_MSEC_DELAY; while (i) { data = READ_REG32(ha, Q8_HOST_MBOX_CNTRL); if (data == 0) break; if (no_pause) { DELAY(1000); } else { qla_mdelay(__func__, 1); } i--; } if (i == 0) { device_printf(ha->pci_dev, "%s: host_mbx_cntrl 0x%08x\n", __func__, data); ret = -1; ha->qla_initiate_recovery = 1; goto exit_qla_mbx_cmd; } for (i = 0; i < n_hmbox; i++) { WRITE_REG32(ha, (Q8_HOST_MBOX0 + (i << 2)), *h_mbox); h_mbox++; } WRITE_REG32(ha, Q8_HOST_MBOX_CNTRL, 0x1); i = Q8_MBX_MSEC_DELAY; while (i) { data = READ_REG32(ha, Q8_FW_MBOX_CNTRL); if ((data & 0x3) == 1) { data = READ_REG32(ha, Q8_FW_MBOX0); if ((data & 0xF000) != 0x8000) break; } if (no_pause) { DELAY(1000); } else { qla_mdelay(__func__, 1); } i--; } if (i == 0) { device_printf(ha->pci_dev, "%s: fw_mbx_cntrl 0x%08x\n", __func__, data); ret = -2; ha->qla_initiate_recovery = 1; goto exit_qla_mbx_cmd; } for (i = 0; i < n_fwmbox; i++) { *fw_mbox++ = READ_REG32(ha, (Q8_FW_MBOX0 + (i << 2))); } WRITE_REG32(ha, Q8_FW_MBOX_CNTRL, 0x0); WRITE_REG32(ha, ha->hw.mbx_intr_mask_offset, 0x0); exit_qla_mbx_cmd: return (ret); } int qla_get_nic_partition(qla_host_t *ha, uint32_t *supports_9kb, uint32_t *num_rcvq) { uint32_t *mbox, err; device_t dev = ha->pci_dev; bzero(ha->hw.mbox, (sizeof (uint32_t) * Q8_NUM_MBOX)); mbox = ha->hw.mbox; mbox[0] = Q8_MBX_GET_NIC_PARTITION | (0x2 << 16) | (0x2 << 29); if (qla_mbx_cmd(ha, mbox, 2, mbox, 19, 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } err = mbox[0] >> 25; if (supports_9kb != NULL) { if (mbox[16] & 0x80) /* bit 7 of mbox 16 */ *supports_9kb = 1; else *supports_9kb = 0; } if (num_rcvq != NULL) *num_rcvq = ((mbox[6] >> 16) & 0xFFFF); if ((err != 1) && (err != 0)) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } return 0; } static int qla_config_intr_cntxt(qla_host_t *ha, uint32_t start_idx, uint32_t num_intrs, uint32_t create) { uint32_t i, err; device_t dev = ha->pci_dev; q80_config_intr_t *c_intr; q80_config_intr_rsp_t *c_intr_rsp; c_intr = (q80_config_intr_t *)ha->hw.mbox; bzero(c_intr, (sizeof (q80_config_intr_t))); c_intr->opcode = Q8_MBX_CONFIG_INTR; c_intr->count_version = (sizeof (q80_config_intr_t) >> 2); c_intr->count_version |= Q8_MBX_CMD_VERSION; c_intr->nentries = num_intrs; for (i = 0; i < num_intrs; i++) { if (create) { c_intr->intr[i].cmd_type = Q8_MBX_CONFIG_INTR_CREATE; c_intr->intr[i].msix_index = start_idx + 1 + i; } else { c_intr->intr[i].cmd_type = Q8_MBX_CONFIG_INTR_DELETE; c_intr->intr[i].msix_index = ha->hw.intr_id[(start_idx + i)]; } c_intr->intr[i].cmd_type |= Q8_MBX_CONFIG_INTR_TYPE_MSI_X; } if (qla_mbx_cmd(ha, (uint32_t *)c_intr, (sizeof (q80_config_intr_t) >> 2), ha->hw.mbox, (sizeof (q80_config_intr_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } c_intr_rsp = (q80_config_intr_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(c_intr_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x, %d]\n", __func__, err, c_intr_rsp->nentries); for (i = 0; i < c_intr_rsp->nentries; i++) { device_printf(dev, "%s: [%d]:[0x%x 0x%x 0x%x]\n", __func__, i, c_intr_rsp->intr[i].status, c_intr_rsp->intr[i].intr_id, c_intr_rsp->intr[i].intr_src); } return (-1); } for (i = 0; ((i < num_intrs) && create); i++) { if (!c_intr_rsp->intr[i].status) { ha->hw.intr_id[(start_idx + i)] = c_intr_rsp->intr[i].intr_id; ha->hw.intr_src[(start_idx + i)] = c_intr_rsp->intr[i].intr_src; } } return (0); } /* * Name: qla_config_rss * Function: Configure RSS for the context/interface. */ static const uint64_t rss_key[] = { 0xbeac01fa6a42b73bULL, 0x8030f20c77cb2da3ULL, 0xae7b30b4d0ca2bcbULL, 0x43a38fb04167253dULL, 0x255b0ec26d5a56daULL }; static int qla_config_rss(qla_host_t *ha, uint16_t cntxt_id) { q80_config_rss_t *c_rss; q80_config_rss_rsp_t *c_rss_rsp; uint32_t err, i; device_t dev = ha->pci_dev; c_rss = (q80_config_rss_t *)ha->hw.mbox; bzero(c_rss, (sizeof (q80_config_rss_t))); c_rss->opcode = Q8_MBX_CONFIG_RSS; c_rss->count_version = (sizeof (q80_config_rss_t) >> 2); c_rss->count_version |= Q8_MBX_CMD_VERSION; c_rss->hash_type = (Q8_MBX_RSS_HASH_TYPE_IPV4_TCP_IP | Q8_MBX_RSS_HASH_TYPE_IPV6_TCP_IP); //c_rss->hash_type = (Q8_MBX_RSS_HASH_TYPE_IPV4_TCP | // Q8_MBX_RSS_HASH_TYPE_IPV6_TCP); c_rss->flags = Q8_MBX_RSS_FLAGS_ENABLE_RSS; c_rss->flags |= Q8_MBX_RSS_FLAGS_USE_IND_TABLE; c_rss->indtbl_mask = Q8_MBX_RSS_INDTBL_MASK; c_rss->indtbl_mask |= Q8_MBX_RSS_FLAGS_MULTI_RSS_VALID; c_rss->flags |= Q8_MBX_RSS_FLAGS_TYPE_CRSS; c_rss->cntxt_id = cntxt_id; for (i = 0; i < 5; i++) { c_rss->rss_key[i] = rss_key[i]; } if (qla_mbx_cmd(ha, (uint32_t *)c_rss, (sizeof (q80_config_rss_t) >> 2), ha->hw.mbox, (sizeof(q80_config_rss_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } c_rss_rsp = (q80_config_rss_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(c_rss_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } return 0; } static int qla_set_rss_ind_table(qla_host_t *ha, uint32_t start_idx, uint32_t count, uint16_t cntxt_id, uint8_t *ind_table) { q80_config_rss_ind_table_t *c_rss_ind; q80_config_rss_ind_table_rsp_t *c_rss_ind_rsp; uint32_t err; device_t dev = ha->pci_dev; if ((count > Q8_RSS_IND_TBL_SIZE) || ((start_idx + count - 1) > Q8_RSS_IND_TBL_MAX_IDX)) { device_printf(dev, "%s: illegal count [%d, %d]\n", __func__, start_idx, count); return (-1); } c_rss_ind = (q80_config_rss_ind_table_t *)ha->hw.mbox; bzero(c_rss_ind, sizeof (q80_config_rss_ind_table_t)); c_rss_ind->opcode = Q8_MBX_CONFIG_RSS_TABLE; c_rss_ind->count_version = (sizeof (q80_config_rss_ind_table_t) >> 2); c_rss_ind->count_version |= Q8_MBX_CMD_VERSION; c_rss_ind->start_idx = start_idx; c_rss_ind->end_idx = start_idx + count - 1; c_rss_ind->cntxt_id = cntxt_id; bcopy(ind_table, c_rss_ind->ind_table, count); if (qla_mbx_cmd(ha, (uint32_t *)c_rss_ind, (sizeof (q80_config_rss_ind_table_t) >> 2), ha->hw.mbox, (sizeof(q80_config_rss_ind_table_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } c_rss_ind_rsp = (q80_config_rss_ind_table_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(c_rss_ind_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } return 0; } /* * Name: qla_config_intr_coalesce * Function: Configure Interrupt Coalescing. */ static int qla_config_intr_coalesce(qla_host_t *ha, uint16_t cntxt_id, int tenable, int rcv) { q80_config_intr_coalesc_t *intrc; q80_config_intr_coalesc_rsp_t *intrc_rsp; uint32_t err, i; device_t dev = ha->pci_dev; intrc = (q80_config_intr_coalesc_t *)ha->hw.mbox; bzero(intrc, (sizeof (q80_config_intr_coalesc_t))); intrc->opcode = Q8_MBX_CONFIG_INTR_COALESCE; intrc->count_version = (sizeof (q80_config_intr_coalesc_t) >> 2); intrc->count_version |= Q8_MBX_CMD_VERSION; if (rcv) { intrc->flags = Q8_MBX_INTRC_FLAGS_RCV; intrc->max_pkts = ha->hw.rcv_intr_coalesce & 0xFFFF; intrc->max_mswait = (ha->hw.rcv_intr_coalesce >> 16) & 0xFFFF; } else { intrc->flags = Q8_MBX_INTRC_FLAGS_XMT; intrc->max_pkts = ha->hw.xmt_intr_coalesce & 0xFFFF; intrc->max_mswait = (ha->hw.xmt_intr_coalesce >> 16) & 0xFFFF; } intrc->cntxt_id = cntxt_id; if (tenable) { intrc->flags |= Q8_MBX_INTRC_FLAGS_PERIODIC; intrc->timer_type = Q8_MBX_INTRC_TIMER_PERIODIC; for (i = 0; i < ha->hw.num_sds_rings; i++) { intrc->sds_ring_mask |= (1 << i); } intrc->ms_timeout = 1000; } if (qla_mbx_cmd(ha, (uint32_t *)intrc, (sizeof (q80_config_intr_coalesc_t) >> 2), ha->hw.mbox, (sizeof(q80_config_intr_coalesc_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } intrc_rsp = (q80_config_intr_coalesc_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(intrc_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } return 0; } /* * Name: qla_config_mac_addr * Function: binds a MAC address to the context/interface. * Can be unicast, multicast or broadcast. */ static int -qla_config_mac_addr(qla_host_t *ha, uint8_t *mac_addr, uint32_t add_mac) +qla_config_mac_addr(qla_host_t *ha, uint8_t *mac_addr, uint32_t add_mac, + uint32_t num_mac) { q80_config_mac_addr_t *cmac; q80_config_mac_addr_rsp_t *cmac_rsp; uint32_t err; device_t dev = ha->pci_dev; + int i; + uint8_t *mac_cpy = mac_addr; + if (num_mac > Q8_MAX_MAC_ADDRS) { + device_printf(dev, "%s: %s num_mac [0x%x] > Q8_MAX_MAC_ADDRS\n", + __func__, (add_mac ? "Add" : "Del"), num_mac); + return (-1); + } + cmac = (q80_config_mac_addr_t *)ha->hw.mbox; bzero(cmac, (sizeof (q80_config_mac_addr_t))); cmac->opcode = Q8_MBX_CONFIG_MAC_ADDR; cmac->count_version = sizeof (q80_config_mac_addr_t) >> 2; cmac->count_version |= Q8_MBX_CMD_VERSION; if (add_mac) cmac->cmd = Q8_MBX_CMAC_CMD_ADD_MAC_ADDR; else cmac->cmd = Q8_MBX_CMAC_CMD_DEL_MAC_ADDR; cmac->cmd |= Q8_MBX_CMAC_CMD_CAM_INGRESS; - cmac->nmac_entries = 1; + cmac->nmac_entries = num_mac; cmac->cntxt_id = ha->hw.rcv_cntxt_id; - bcopy(mac_addr, cmac->mac_addr[0].addr, 6); + for (i = 0; i < num_mac; i++) { + bcopy(mac_addr, cmac->mac_addr[i].addr, Q8_ETHER_ADDR_LEN); + mac_addr = mac_addr + ETHER_ADDR_LEN; + } + if (qla_mbx_cmd(ha, (uint32_t *)cmac, (sizeof (q80_config_mac_addr_t) >> 2), ha->hw.mbox, (sizeof(q80_config_mac_addr_rsp_t) >> 2), 1)) { device_printf(dev, "%s: %s failed0\n", __func__, (add_mac ? "Add" : "Del")); return (-1); } cmac_rsp = (q80_config_mac_addr_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(cmac_rsp->regcnt_status); if (err) { - device_printf(dev, "%s: %s " - "%02x:%02x:%02x:%02x:%02x:%02x failed1 [0x%08x]\n", - __func__, (add_mac ? "Add" : "Del"), - mac_addr[0], mac_addr[1], mac_addr[2], - mac_addr[3], mac_addr[4], mac_addr[5], err); + device_printf(dev, "%s: %s failed1 [0x%08x]\n", __func__, + (add_mac ? "Add" : "Del"), err); + for (i = 0; i < num_mac; i++) { + device_printf(dev, "%s: %02x:%02x:%02x:%02x:%02x:%02x\n", + __func__, mac_cpy[0], mac_cpy[1], mac_cpy[2], + mac_cpy[3], mac_cpy[4], mac_cpy[5]); + mac_cpy += ETHER_ADDR_LEN; + } return (-1); } return 0; } /* * Name: qla_set_mac_rcv_mode * Function: Enable/Disable AllMulticast and Promiscous Modes. */ static int qla_set_mac_rcv_mode(qla_host_t *ha, uint32_t mode) { q80_config_mac_rcv_mode_t *rcv_mode; uint32_t err; q80_config_mac_rcv_mode_rsp_t *rcv_mode_rsp; device_t dev = ha->pci_dev; rcv_mode = (q80_config_mac_rcv_mode_t *)ha->hw.mbox; bzero(rcv_mode, (sizeof (q80_config_mac_rcv_mode_t))); rcv_mode->opcode = Q8_MBX_CONFIG_MAC_RX_MODE; rcv_mode->count_version = sizeof (q80_config_mac_rcv_mode_t) >> 2; rcv_mode->count_version |= Q8_MBX_CMD_VERSION; rcv_mode->mode = mode; rcv_mode->cntxt_id = ha->hw.rcv_cntxt_id; if (qla_mbx_cmd(ha, (uint32_t *)rcv_mode, (sizeof (q80_config_mac_rcv_mode_t) >> 2), ha->hw.mbox, (sizeof(q80_config_mac_rcv_mode_rsp_t) >> 2), 1)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } rcv_mode_rsp = (q80_config_mac_rcv_mode_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(rcv_mode_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } return 0; } int ql_set_promisc(qla_host_t *ha) { int ret; ha->hw.mac_rcv_mode |= Q8_MBX_MAC_RCV_PROMISC_ENABLE; ret = qla_set_mac_rcv_mode(ha, ha->hw.mac_rcv_mode); return (ret); } void qla_reset_promisc(qla_host_t *ha) { ha->hw.mac_rcv_mode &= ~Q8_MBX_MAC_RCV_PROMISC_ENABLE; (void)qla_set_mac_rcv_mode(ha, ha->hw.mac_rcv_mode); } int ql_set_allmulti(qla_host_t *ha) { int ret; ha->hw.mac_rcv_mode |= Q8_MBX_MAC_ALL_MULTI_ENABLE; ret = qla_set_mac_rcv_mode(ha, ha->hw.mac_rcv_mode); return (ret); } void qla_reset_allmulti(qla_host_t *ha) { ha->hw.mac_rcv_mode &= ~Q8_MBX_MAC_ALL_MULTI_ENABLE; (void)qla_set_mac_rcv_mode(ha, ha->hw.mac_rcv_mode); } /* * Name: ql_set_max_mtu * Function: * Sets the maximum transfer unit size for the specified rcv context. */ int ql_set_max_mtu(qla_host_t *ha, uint32_t mtu, uint16_t cntxt_id) { device_t dev; q80_set_max_mtu_t *max_mtu; q80_set_max_mtu_rsp_t *max_mtu_rsp; uint32_t err; dev = ha->pci_dev; max_mtu = (q80_set_max_mtu_t *)ha->hw.mbox; bzero(max_mtu, (sizeof (q80_set_max_mtu_t))); max_mtu->opcode = Q8_MBX_SET_MAX_MTU; max_mtu->count_version = (sizeof (q80_set_max_mtu_t) >> 2); max_mtu->count_version |= Q8_MBX_CMD_VERSION; max_mtu->cntxt_id = cntxt_id; max_mtu->mtu = mtu; if (qla_mbx_cmd(ha, (uint32_t *)max_mtu, (sizeof (q80_set_max_mtu_t) >> 2), ha->hw.mbox, (sizeof (q80_set_max_mtu_rsp_t) >> 2), 1)) { device_printf(dev, "%s: failed\n", __func__); return -1; } max_mtu_rsp = (q80_set_max_mtu_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(max_mtu_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } return 0; } static int qla_link_event_req(qla_host_t *ha, uint16_t cntxt_id) { device_t dev; q80_link_event_t *lnk; q80_link_event_rsp_t *lnk_rsp; uint32_t err; dev = ha->pci_dev; lnk = (q80_link_event_t *)ha->hw.mbox; bzero(lnk, (sizeof (q80_link_event_t))); lnk->opcode = Q8_MBX_LINK_EVENT_REQ; lnk->count_version = (sizeof (q80_link_event_t) >> 2); lnk->count_version |= Q8_MBX_CMD_VERSION; lnk->cntxt_id = cntxt_id; lnk->cmd = Q8_LINK_EVENT_CMD_ENABLE_ASYNC; if (qla_mbx_cmd(ha, (uint32_t *)lnk, (sizeof (q80_link_event_t) >> 2), ha->hw.mbox, (sizeof (q80_link_event_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } lnk_rsp = (q80_link_event_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(lnk_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } return 0; } static int qla_config_fw_lro(qla_host_t *ha, uint16_t cntxt_id) { device_t dev; q80_config_fw_lro_t *fw_lro; q80_config_fw_lro_rsp_t *fw_lro_rsp; uint32_t err; dev = ha->pci_dev; fw_lro = (q80_config_fw_lro_t *)ha->hw.mbox; bzero(fw_lro, sizeof(q80_config_fw_lro_t)); fw_lro->opcode = Q8_MBX_CONFIG_FW_LRO; fw_lro->count_version = (sizeof (q80_config_fw_lro_t) >> 2); fw_lro->count_version |= Q8_MBX_CMD_VERSION; fw_lro->flags |= Q8_MBX_FW_LRO_IPV4 | Q8_MBX_FW_LRO_IPV4_WO_DST_IP_CHK; fw_lro->flags |= Q8_MBX_FW_LRO_IPV6 | Q8_MBX_FW_LRO_IPV6_WO_DST_IP_CHK; fw_lro->cntxt_id = cntxt_id; if (qla_mbx_cmd(ha, (uint32_t *)fw_lro, (sizeof (q80_config_fw_lro_t) >> 2), ha->hw.mbox, (sizeof (q80_config_fw_lro_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } fw_lro_rsp = (q80_config_fw_lro_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(fw_lro_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } return 0; } static int qla_set_cam_search_mode(qla_host_t *ha, uint32_t search_mode) { device_t dev; q80_hw_config_t *hw_config; q80_hw_config_rsp_t *hw_config_rsp; uint32_t err; dev = ha->pci_dev; hw_config = (q80_hw_config_t *)ha->hw.mbox; bzero(hw_config, sizeof (q80_hw_config_t)); hw_config->opcode = Q8_MBX_HW_CONFIG; hw_config->count_version = Q8_HW_CONFIG_SET_CAM_SEARCH_MODE_COUNT; hw_config->count_version |= Q8_MBX_CMD_VERSION; hw_config->cmd = Q8_HW_CONFIG_SET_CAM_SEARCH_MODE; hw_config->u.set_cam_search_mode.mode = search_mode; if (qla_mbx_cmd(ha, (uint32_t *)hw_config, (sizeof (q80_hw_config_t) >> 2), ha->hw.mbox, (sizeof (q80_hw_config_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } hw_config_rsp = (q80_hw_config_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(hw_config_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } return 0; } static int qla_get_cam_search_mode(qla_host_t *ha) { device_t dev; q80_hw_config_t *hw_config; q80_hw_config_rsp_t *hw_config_rsp; uint32_t err; dev = ha->pci_dev; hw_config = (q80_hw_config_t *)ha->hw.mbox; bzero(hw_config, sizeof (q80_hw_config_t)); hw_config->opcode = Q8_MBX_HW_CONFIG; hw_config->count_version = Q8_HW_CONFIG_GET_CAM_SEARCH_MODE_COUNT; hw_config->count_version |= Q8_MBX_CMD_VERSION; hw_config->cmd = Q8_HW_CONFIG_GET_CAM_SEARCH_MODE; if (qla_mbx_cmd(ha, (uint32_t *)hw_config, (sizeof (q80_hw_config_t) >> 2), ha->hw.mbox, (sizeof (q80_hw_config_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } hw_config_rsp = (q80_hw_config_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(hw_config_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } else { device_printf(dev, "%s: cam search mode [0x%08x]\n", __func__, hw_config_rsp->u.get_cam_search_mode.mode); } return 0; } static void qla_xmt_stats(qla_host_t *ha, q80_xmt_stats_t *xstat, int i) { device_t dev = ha->pci_dev; if (i < ha->hw.num_tx_rings) { device_printf(dev, "%s[%d]: total_bytes\t\t%" PRIu64 "\n", __func__, i, xstat->total_bytes); device_printf(dev, "%s[%d]: total_pkts\t\t%" PRIu64 "\n", __func__, i, xstat->total_pkts); device_printf(dev, "%s[%d]: errors\t\t%" PRIu64 "\n", __func__, i, xstat->errors); device_printf(dev, "%s[%d]: pkts_dropped\t%" PRIu64 "\n", __func__, i, xstat->pkts_dropped); device_printf(dev, "%s[%d]: switch_pkts\t\t%" PRIu64 "\n", __func__, i, xstat->switch_pkts); device_printf(dev, "%s[%d]: num_buffers\t\t%" PRIu64 "\n", __func__, i, xstat->num_buffers); } else { device_printf(dev, "%s: total_bytes\t\t\t%" PRIu64 "\n", __func__, xstat->total_bytes); device_printf(dev, "%s: total_pkts\t\t\t%" PRIu64 "\n", __func__, xstat->total_pkts); device_printf(dev, "%s: errors\t\t\t%" PRIu64 "\n", __func__, xstat->errors); device_printf(dev, "%s: pkts_dropped\t\t\t%" PRIu64 "\n", __func__, xstat->pkts_dropped); device_printf(dev, "%s: switch_pkts\t\t\t%" PRIu64 "\n", __func__, xstat->switch_pkts); device_printf(dev, "%s: num_buffers\t\t\t%" PRIu64 "\n", __func__, xstat->num_buffers); } } static void qla_rcv_stats(qla_host_t *ha, q80_rcv_stats_t *rstat) { device_t dev = ha->pci_dev; device_printf(dev, "%s: total_bytes\t\t\t%" PRIu64 "\n", __func__, rstat->total_bytes); device_printf(dev, "%s: total_pkts\t\t\t%" PRIu64 "\n", __func__, rstat->total_pkts); device_printf(dev, "%s: lro_pkt_count\t\t%" PRIu64 "\n", __func__, rstat->lro_pkt_count); device_printf(dev, "%s: sw_pkt_count\t\t\t%" PRIu64 "\n", __func__, rstat->sw_pkt_count); device_printf(dev, "%s: ip_chksum_err\t\t%" PRIu64 "\n", __func__, rstat->ip_chksum_err); device_printf(dev, "%s: pkts_wo_acntxts\t\t%" PRIu64 "\n", __func__, rstat->pkts_wo_acntxts); device_printf(dev, "%s: pkts_dropped_no_sds_card\t%" PRIu64 "\n", __func__, rstat->pkts_dropped_no_sds_card); device_printf(dev, "%s: pkts_dropped_no_sds_host\t%" PRIu64 "\n", __func__, rstat->pkts_dropped_no_sds_host); device_printf(dev, "%s: oversized_pkts\t\t%" PRIu64 "\n", __func__, rstat->oversized_pkts); device_printf(dev, "%s: pkts_dropped_no_rds\t\t%" PRIu64 "\n", __func__, rstat->pkts_dropped_no_rds); device_printf(dev, "%s: unxpctd_mcast_pkts\t\t%" PRIu64 "\n", __func__, rstat->unxpctd_mcast_pkts); device_printf(dev, "%s: re1_fbq_error\t\t%" PRIu64 "\n", __func__, rstat->re1_fbq_error); device_printf(dev, "%s: invalid_mac_addr\t\t%" PRIu64 "\n", __func__, rstat->invalid_mac_addr); device_printf(dev, "%s: rds_prime_trys\t\t%" PRIu64 "\n", __func__, rstat->rds_prime_trys); device_printf(dev, "%s: rds_prime_success\t\t%" PRIu64 "\n", __func__, rstat->rds_prime_success); device_printf(dev, "%s: lro_flows_added\t\t%" PRIu64 "\n", __func__, rstat->lro_flows_added); device_printf(dev, "%s: lro_flows_deleted\t\t%" PRIu64 "\n", __func__, rstat->lro_flows_deleted); device_printf(dev, "%s: lro_flows_active\t\t%" PRIu64 "\n", __func__, rstat->lro_flows_active); device_printf(dev, "%s: pkts_droped_unknown\t\t%" PRIu64 "\n", __func__, rstat->pkts_droped_unknown); } static void qla_mac_stats(qla_host_t *ha, q80_mac_stats_t *mstat) { device_t dev = ha->pci_dev; device_printf(dev, "%s: xmt_frames\t\t\t%" PRIu64 "\n", __func__, mstat->xmt_frames); device_printf(dev, "%s: xmt_bytes\t\t\t%" PRIu64 "\n", __func__, mstat->xmt_bytes); device_printf(dev, "%s: xmt_mcast_pkts\t\t%" PRIu64 "\n", __func__, mstat->xmt_mcast_pkts); device_printf(dev, "%s: xmt_bcast_pkts\t\t%" PRIu64 "\n", __func__, mstat->xmt_bcast_pkts); device_printf(dev, "%s: xmt_pause_frames\t\t%" PRIu64 "\n", __func__, mstat->xmt_pause_frames); device_printf(dev, "%s: xmt_cntrl_pkts\t\t%" PRIu64 "\n", __func__, mstat->xmt_cntrl_pkts); device_printf(dev, "%s: xmt_pkt_lt_64bytes\t\t%" PRIu64 "\n", __func__, mstat->xmt_pkt_lt_64bytes); device_printf(dev, "%s: xmt_pkt_lt_127bytes\t\t%" PRIu64 "\n", __func__, mstat->xmt_pkt_lt_127bytes); device_printf(dev, "%s: xmt_pkt_lt_255bytes\t\t%" PRIu64 "\n", __func__, mstat->xmt_pkt_lt_255bytes); device_printf(dev, "%s: xmt_pkt_lt_511bytes\t\t%" PRIu64 "\n", __func__, mstat->xmt_pkt_lt_511bytes); device_printf(dev, "%s: xmt_pkt_lt_1023bytes\t\t%" PRIu64 "\n", __func__, mstat->xmt_pkt_lt_1023bytes); device_printf(dev, "%s: xmt_pkt_lt_1518bytes\t\t%" PRIu64 "\n", __func__, mstat->xmt_pkt_lt_1518bytes); device_printf(dev, "%s: xmt_pkt_gt_1518bytes\t\t%" PRIu64 "\n", __func__, mstat->xmt_pkt_gt_1518bytes); device_printf(dev, "%s: rcv_frames\t\t\t%" PRIu64 "\n", __func__, mstat->rcv_frames); device_printf(dev, "%s: rcv_bytes\t\t\t%" PRIu64 "\n", __func__, mstat->rcv_bytes); device_printf(dev, "%s: rcv_mcast_pkts\t\t%" PRIu64 "\n", __func__, mstat->rcv_mcast_pkts); device_printf(dev, "%s: rcv_bcast_pkts\t\t%" PRIu64 "\n", __func__, mstat->rcv_bcast_pkts); device_printf(dev, "%s: rcv_pause_frames\t\t%" PRIu64 "\n", __func__, mstat->rcv_pause_frames); device_printf(dev, "%s: rcv_cntrl_pkts\t\t%" PRIu64 "\n", __func__, mstat->rcv_cntrl_pkts); device_printf(dev, "%s: rcv_pkt_lt_64bytes\t\t%" PRIu64 "\n", __func__, mstat->rcv_pkt_lt_64bytes); device_printf(dev, "%s: rcv_pkt_lt_127bytes\t\t%" PRIu64 "\n", __func__, mstat->rcv_pkt_lt_127bytes); device_printf(dev, "%s: rcv_pkt_lt_255bytes\t\t%" PRIu64 "\n", __func__, mstat->rcv_pkt_lt_255bytes); device_printf(dev, "%s: rcv_pkt_lt_511bytes\t\t%" PRIu64 "\n", __func__, mstat->rcv_pkt_lt_511bytes); device_printf(dev, "%s: rcv_pkt_lt_1023bytes\t\t%" PRIu64 "\n", __func__, mstat->rcv_pkt_lt_1023bytes); device_printf(dev, "%s: rcv_pkt_lt_1518bytes\t\t%" PRIu64 "\n", __func__, mstat->rcv_pkt_lt_1518bytes); device_printf(dev, "%s: rcv_pkt_gt_1518bytes\t\t%" PRIu64 "\n", __func__, mstat->rcv_pkt_gt_1518bytes); device_printf(dev, "%s: rcv_len_error\t\t%" PRIu64 "\n", __func__, mstat->rcv_len_error); device_printf(dev, "%s: rcv_len_small\t\t%" PRIu64 "\n", __func__, mstat->rcv_len_small); device_printf(dev, "%s: rcv_len_large\t\t%" PRIu64 "\n", __func__, mstat->rcv_len_large); device_printf(dev, "%s: rcv_jabber\t\t\t%" PRIu64 "\n", __func__, mstat->rcv_jabber); device_printf(dev, "%s: rcv_dropped\t\t\t%" PRIu64 "\n", __func__, mstat->rcv_dropped); device_printf(dev, "%s: fcs_error\t\t\t%" PRIu64 "\n", __func__, mstat->fcs_error); device_printf(dev, "%s: align_error\t\t\t%" PRIu64 "\n", __func__, mstat->align_error); } static int qla_get_hw_stats(qla_host_t *ha, uint32_t cmd, uint32_t rsp_size) { device_t dev; q80_get_stats_t *stat; q80_get_stats_rsp_t *stat_rsp; uint32_t err; dev = ha->pci_dev; stat = (q80_get_stats_t *)ha->hw.mbox; bzero(stat, (sizeof (q80_get_stats_t))); stat->opcode = Q8_MBX_GET_STATS; stat->count_version = 2; stat->count_version |= Q8_MBX_CMD_VERSION; stat->cmd = cmd; if (qla_mbx_cmd(ha, (uint32_t *)stat, 2, ha->hw.mbox, (rsp_size >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } stat_rsp = (q80_get_stats_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(stat_rsp->regcnt_status); if (err) { return -1; } return 0; } void ql_get_stats(qla_host_t *ha) { q80_get_stats_rsp_t *stat_rsp; q80_mac_stats_t *mstat; q80_xmt_stats_t *xstat; q80_rcv_stats_t *rstat; uint32_t cmd; int i; stat_rsp = (q80_get_stats_rsp_t *)ha->hw.mbox; /* * Get MAC Statistics */ cmd = Q8_GET_STATS_CMD_TYPE_MAC; // cmd |= Q8_GET_STATS_CMD_CLEAR; cmd |= ((ha->pci_func & 0x1) << 16); if (qla_get_hw_stats(ha, cmd, sizeof (q80_get_stats_rsp_t)) == 0) { mstat = (q80_mac_stats_t *)&stat_rsp->u.mac; qla_mac_stats(ha, mstat); } else { device_printf(ha->pci_dev, "%s: mac failed [0x%08x]\n", __func__, ha->hw.mbox[0]); } /* * Get RCV Statistics */ cmd = Q8_GET_STATS_CMD_RCV | Q8_GET_STATS_CMD_TYPE_CNTXT; // cmd |= Q8_GET_STATS_CMD_CLEAR; cmd |= (ha->hw.rcv_cntxt_id << 16); if (qla_get_hw_stats(ha, cmd, sizeof (q80_get_stats_rsp_t)) == 0) { rstat = (q80_rcv_stats_t *)&stat_rsp->u.rcv; qla_rcv_stats(ha, rstat); } else { device_printf(ha->pci_dev, "%s: rcv failed [0x%08x]\n", __func__, ha->hw.mbox[0]); } /* * Get XMT Statistics */ for (i = 0 ; i < ha->hw.num_tx_rings; i++) { cmd = Q8_GET_STATS_CMD_XMT | Q8_GET_STATS_CMD_TYPE_CNTXT; // cmd |= Q8_GET_STATS_CMD_CLEAR; cmd |= (ha->hw.tx_cntxt[i].tx_cntxt_id << 16); if (qla_get_hw_stats(ha, cmd, sizeof(q80_get_stats_rsp_t)) == 0) { xstat = (q80_xmt_stats_t *)&stat_rsp->u.xmt; qla_xmt_stats(ha, xstat, i); } else { device_printf(ha->pci_dev, "%s: xmt failed [0x%08x]\n", __func__, ha->hw.mbox[0]); } } return; } static void qla_get_quick_stats(qla_host_t *ha) { q80_get_mac_rcv_xmt_stats_rsp_t *stat_rsp; q80_mac_stats_t *mstat; q80_xmt_stats_t *xstat; q80_rcv_stats_t *rstat; uint32_t cmd; stat_rsp = (q80_get_mac_rcv_xmt_stats_rsp_t *)ha->hw.mbox; cmd = Q8_GET_STATS_CMD_TYPE_ALL; // cmd |= Q8_GET_STATS_CMD_CLEAR; // cmd |= ((ha->pci_func & 0x3) << 16); cmd |= (0xFFFF << 16); if (qla_get_hw_stats(ha, cmd, sizeof (q80_get_mac_rcv_xmt_stats_rsp_t)) == 0) { mstat = (q80_mac_stats_t *)&stat_rsp->mac; rstat = (q80_rcv_stats_t *)&stat_rsp->rcv; xstat = (q80_xmt_stats_t *)&stat_rsp->xmt; qla_mac_stats(ha, mstat); qla_rcv_stats(ha, rstat); qla_xmt_stats(ha, xstat, ha->hw.num_tx_rings); } else { device_printf(ha->pci_dev, "%s: failed [0x%08x]\n", __func__, ha->hw.mbox[0]); } return; } /* * Name: qla_tx_tso * Function: Checks if the packet to be transmitted is a candidate for * Large TCP Segment Offload. If yes, the appropriate fields in the Tx * Ring Structure are plugged in. */ static int qla_tx_tso(qla_host_t *ha, struct mbuf *mp, q80_tx_cmd_t *tx_cmd, uint8_t *hdr) { struct ether_vlan_header *eh; struct ip *ip = NULL; struct ip6_hdr *ip6 = NULL; struct tcphdr *th = NULL; uint32_t ehdrlen, hdrlen, ip_hlen, tcp_hlen, tcp_opt_off; uint16_t etype, opcode, offload = 1; device_t dev; dev = ha->pci_dev; eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = ntohs(eh->evl_proto); } else { ehdrlen = ETHER_HDR_LEN; etype = ntohs(eh->evl_encap_proto); } hdrlen = 0; switch (etype) { case ETHERTYPE_IP: tcp_opt_off = ehdrlen + sizeof(struct ip) + sizeof(struct tcphdr); if (mp->m_len < tcp_opt_off) { m_copydata(mp, 0, tcp_opt_off, hdr); ip = (struct ip *)(hdr + ehdrlen); } else { ip = (struct ip *)(mp->m_data + ehdrlen); } ip_hlen = ip->ip_hl << 2; opcode = Q8_TX_CMD_OP_XMT_TCP_LSO; if ((ip->ip_p != IPPROTO_TCP) || (ip_hlen != sizeof (struct ip))){ /* IP Options are not supported */ offload = 0; } else th = (struct tcphdr *)((caddr_t)ip + ip_hlen); break; case ETHERTYPE_IPV6: tcp_opt_off = ehdrlen + sizeof(struct ip6_hdr) + sizeof (struct tcphdr); if (mp->m_len < tcp_opt_off) { m_copydata(mp, 0, tcp_opt_off, hdr); ip6 = (struct ip6_hdr *)(hdr + ehdrlen); } else { ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); } ip_hlen = sizeof(struct ip6_hdr); opcode = Q8_TX_CMD_OP_XMT_TCP_LSO_IPV6; if (ip6->ip6_nxt != IPPROTO_TCP) { //device_printf(dev, "%s: ipv6\n", __func__); offload = 0; } else th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); break; default: QL_DPRINT8(ha, (dev, "%s: type!=ip\n", __func__)); offload = 0; break; } if (!offload) return (-1); tcp_hlen = th->th_off << 2; hdrlen = ehdrlen + ip_hlen + tcp_hlen; if (mp->m_len < hdrlen) { if (mp->m_len < tcp_opt_off) { if (tcp_hlen > sizeof(struct tcphdr)) { m_copydata(mp, tcp_opt_off, (tcp_hlen - sizeof(struct tcphdr)), &hdr[tcp_opt_off]); } } else { m_copydata(mp, 0, hdrlen, hdr); } } tx_cmd->mss = mp->m_pkthdr.tso_segsz; tx_cmd->flags_opcode = opcode ; tx_cmd->tcp_hdr_off = ip_hlen + ehdrlen; tx_cmd->total_hdr_len = hdrlen; /* Check for Multicast least significant bit of MSB == 1 */ if (eh->evl_dhost[0] & 0x01) { tx_cmd->flags_opcode |= Q8_TX_CMD_FLAGS_MULTICAST; } if (mp->m_len < hdrlen) { printf("%d\n", hdrlen); return (1); } return (0); } /* * Name: qla_tx_chksum * Function: Checks if the packet to be transmitted is a candidate for * TCP/UDP Checksum offload. If yes, the appropriate fields in the Tx * Ring Structure are plugged in. */ static int qla_tx_chksum(qla_host_t *ha, struct mbuf *mp, uint32_t *op_code, uint32_t *tcp_hdr_off) { struct ether_vlan_header *eh; struct ip *ip; struct ip6_hdr *ip6; uint32_t ehdrlen, ip_hlen; uint16_t etype, opcode, offload = 1; device_t dev; uint8_t buf[sizeof(struct ip6_hdr)]; dev = ha->pci_dev; *op_code = 0; if ((mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_UDP)) == 0) return (-1); eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = ntohs(eh->evl_proto); } else { ehdrlen = ETHER_HDR_LEN; etype = ntohs(eh->evl_encap_proto); } switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = sizeof (struct ip); if (mp->m_len < (ehdrlen + ip_hlen)) { m_copydata(mp, ehdrlen, sizeof(struct ip), buf); ip = (struct ip *)buf; } if (ip->ip_p == IPPROTO_TCP) opcode = Q8_TX_CMD_OP_XMT_TCP_CHKSUM; else if (ip->ip_p == IPPROTO_UDP) opcode = Q8_TX_CMD_OP_XMT_UDP_CHKSUM; else { //device_printf(dev, "%s: ipv4\n", __func__); offload = 0; } break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); if (mp->m_len < (ehdrlen + ip_hlen)) { m_copydata(mp, ehdrlen, sizeof (struct ip6_hdr), buf); ip6 = (struct ip6_hdr *)buf; } if (ip6->ip6_nxt == IPPROTO_TCP) opcode = Q8_TX_CMD_OP_XMT_TCP_CHKSUM_IPV6; else if (ip6->ip6_nxt == IPPROTO_UDP) opcode = Q8_TX_CMD_OP_XMT_UDP_CHKSUM_IPV6; else { //device_printf(dev, "%s: ipv6\n", __func__); offload = 0; } break; default: offload = 0; break; } if (!offload) return (-1); *op_code = opcode; *tcp_hdr_off = (ip_hlen + ehdrlen); return (0); } #define QLA_TX_MIN_FREE 2 /* * Name: ql_hw_send * Function: Transmits a packet. It first checks if the packet is a * candidate for Large TCP Segment Offload and then for UDP/TCP checksum * offload. If either of these creteria are not met, it is transmitted * as a regular ethernet frame. */ int ql_hw_send(qla_host_t *ha, bus_dma_segment_t *segs, int nsegs, uint32_t tx_idx, struct mbuf *mp, uint32_t txr_idx, uint32_t iscsi_pdu) { struct ether_vlan_header *eh; qla_hw_t *hw = &ha->hw; q80_tx_cmd_t *tx_cmd, tso_cmd; bus_dma_segment_t *c_seg; uint32_t num_tx_cmds, hdr_len = 0; uint32_t total_length = 0, bytes, tx_cmd_count = 0, txr_next; device_t dev; int i, ret; uint8_t *src = NULL, *dst = NULL; uint8_t frame_hdr[QL_FRAME_HDR_SIZE]; uint32_t op_code = 0; uint32_t tcp_hdr_off = 0; dev = ha->pci_dev; /* * Always make sure there is atleast one empty slot in the tx_ring * tx_ring is considered full when there only one entry available */ num_tx_cmds = (nsegs + (Q8_TX_CMD_MAX_SEGMENTS - 1)) >> 2; total_length = mp->m_pkthdr.len; if (total_length > QLA_MAX_TSO_FRAME_SIZE) { device_printf(dev, "%s: total length exceeds maxlen(%d)\n", __func__, total_length); return (-1); } eh = mtod(mp, struct ether_vlan_header *); if (mp->m_pkthdr.csum_flags & CSUM_TSO) { bzero((void *)&tso_cmd, sizeof(q80_tx_cmd_t)); src = frame_hdr; ret = qla_tx_tso(ha, mp, &tso_cmd, src); if (!(ret & ~1)) { /* find the additional tx_cmd descriptors required */ if (mp->m_flags & M_VLANTAG) tso_cmd.total_hdr_len += ETHER_VLAN_ENCAP_LEN; hdr_len = tso_cmd.total_hdr_len; bytes = sizeof(q80_tx_cmd_t) - Q8_TX_CMD_TSO_ALIGN; bytes = QL_MIN(bytes, hdr_len); num_tx_cmds++; hdr_len -= bytes; while (hdr_len) { bytes = QL_MIN((sizeof(q80_tx_cmd_t)), hdr_len); hdr_len -= bytes; num_tx_cmds++; } hdr_len = tso_cmd.total_hdr_len; if (ret == 0) src = (uint8_t *)eh; } else return (EINVAL); } else { (void)qla_tx_chksum(ha, mp, &op_code, &tcp_hdr_off); } if (iscsi_pdu) ha->hw.iscsi_pkt_count++; if (hw->tx_cntxt[txr_idx].txr_free <= (num_tx_cmds + QLA_TX_MIN_FREE)) { qla_hw_tx_done_locked(ha, txr_idx); if (hw->tx_cntxt[txr_idx].txr_free <= (num_tx_cmds + QLA_TX_MIN_FREE)) { QL_DPRINT8(ha, (dev, "%s: (hw->txr_free <= " "(num_tx_cmds + QLA_TX_MIN_FREE))\n", __func__)); return (-1); } } tx_cmd = &hw->tx_cntxt[txr_idx].tx_ring_base[tx_idx]; if (!(mp->m_pkthdr.csum_flags & CSUM_TSO)) { if (nsegs > ha->hw.max_tx_segs) ha->hw.max_tx_segs = nsegs; bzero((void *)tx_cmd, sizeof(q80_tx_cmd_t)); if (op_code) { tx_cmd->flags_opcode = op_code; tx_cmd->tcp_hdr_off = tcp_hdr_off; } else { tx_cmd->flags_opcode = Q8_TX_CMD_OP_XMT_ETHER; } } else { bcopy(&tso_cmd, tx_cmd, sizeof(q80_tx_cmd_t)); ha->tx_tso_frames++; } if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { tx_cmd->flags_opcode |= Q8_TX_CMD_FLAGS_VLAN_TAGGED; if (iscsi_pdu) eh->evl_tag |= ha->hw.user_pri_iscsi << 13; } else if (mp->m_flags & M_VLANTAG) { if (hdr_len) { /* TSO */ tx_cmd->flags_opcode |= (Q8_TX_CMD_FLAGS_VLAN_TAGGED | Q8_TX_CMD_FLAGS_HW_VLAN_ID); tx_cmd->tcp_hdr_off += ETHER_VLAN_ENCAP_LEN; } else tx_cmd->flags_opcode |= Q8_TX_CMD_FLAGS_HW_VLAN_ID; ha->hw_vlan_tx_frames++; tx_cmd->vlan_tci = mp->m_pkthdr.ether_vtag; if (iscsi_pdu) { tx_cmd->vlan_tci |= ha->hw.user_pri_iscsi << 13; mp->m_pkthdr.ether_vtag = tx_cmd->vlan_tci; } } tx_cmd->n_bufs = (uint8_t)nsegs; tx_cmd->data_len_lo = (uint8_t)(total_length & 0xFF); tx_cmd->data_len_hi = qla_host_to_le16(((uint16_t)(total_length >> 8))); tx_cmd->cntxtid = Q8_TX_CMD_PORT_CNXTID(ha->pci_func); c_seg = segs; while (1) { for (i = 0; ((i < Q8_TX_CMD_MAX_SEGMENTS) && nsegs); i++) { switch (i) { case 0: tx_cmd->buf1_addr = c_seg->ds_addr; tx_cmd->buf1_len = c_seg->ds_len; break; case 1: tx_cmd->buf2_addr = c_seg->ds_addr; tx_cmd->buf2_len = c_seg->ds_len; break; case 2: tx_cmd->buf3_addr = c_seg->ds_addr; tx_cmd->buf3_len = c_seg->ds_len; break; case 3: tx_cmd->buf4_addr = c_seg->ds_addr; tx_cmd->buf4_len = c_seg->ds_len; break; } c_seg++; nsegs--; } txr_next = hw->tx_cntxt[txr_idx].txr_next = (hw->tx_cntxt[txr_idx].txr_next + 1) & (NUM_TX_DESCRIPTORS - 1); tx_cmd_count++; if (!nsegs) break; tx_cmd = &hw->tx_cntxt[txr_idx].tx_ring_base[txr_next]; bzero((void *)tx_cmd, sizeof(q80_tx_cmd_t)); } if (mp->m_pkthdr.csum_flags & CSUM_TSO) { /* TSO : Copy the header in the following tx cmd descriptors */ txr_next = hw->tx_cntxt[txr_idx].txr_next; tx_cmd = &hw->tx_cntxt[txr_idx].tx_ring_base[txr_next]; bzero((void *)tx_cmd, sizeof(q80_tx_cmd_t)); bytes = sizeof(q80_tx_cmd_t) - Q8_TX_CMD_TSO_ALIGN; bytes = QL_MIN(bytes, hdr_len); dst = (uint8_t *)tx_cmd + Q8_TX_CMD_TSO_ALIGN; if (mp->m_flags & M_VLANTAG) { /* first copy the src/dst MAC addresses */ bcopy(src, dst, (ETHER_ADDR_LEN * 2)); dst += (ETHER_ADDR_LEN * 2); src += (ETHER_ADDR_LEN * 2); *((uint16_t *)dst) = htons(ETHERTYPE_VLAN); dst += 2; *((uint16_t *)dst) = htons(mp->m_pkthdr.ether_vtag); dst += 2; /* bytes left in src header */ hdr_len -= ((ETHER_ADDR_LEN * 2) + ETHER_VLAN_ENCAP_LEN); /* bytes left in TxCmd Entry */ bytes -= ((ETHER_ADDR_LEN * 2) + ETHER_VLAN_ENCAP_LEN); bcopy(src, dst, bytes); src += bytes; hdr_len -= bytes; } else { bcopy(src, dst, bytes); src += bytes; hdr_len -= bytes; } txr_next = hw->tx_cntxt[txr_idx].txr_next = (hw->tx_cntxt[txr_idx].txr_next + 1) & (NUM_TX_DESCRIPTORS - 1); tx_cmd_count++; while (hdr_len) { tx_cmd = &hw->tx_cntxt[txr_idx].tx_ring_base[txr_next]; bzero((void *)tx_cmd, sizeof(q80_tx_cmd_t)); bytes = QL_MIN((sizeof(q80_tx_cmd_t)), hdr_len); bcopy(src, tx_cmd, bytes); src += bytes; hdr_len -= bytes; txr_next = hw->tx_cntxt[txr_idx].txr_next = (hw->tx_cntxt[txr_idx].txr_next + 1) & (NUM_TX_DESCRIPTORS - 1); tx_cmd_count++; } } hw->tx_cntxt[txr_idx].txr_free = hw->tx_cntxt[txr_idx].txr_free - tx_cmd_count; QL_UPDATE_TX_PRODUCER_INDEX(ha, hw->tx_cntxt[txr_idx].txr_next,\ txr_idx); QL_DPRINT8(ha, (dev, "%s: return\n", __func__)); return (0); } #define Q8_CONFIG_IND_TBL_SIZE 32 /* < Q8_RSS_IND_TBL_SIZE and power of 2 */ static int qla_config_rss_ind_table(qla_host_t *ha) { uint32_t i, count; uint8_t rss_ind_tbl[Q8_CONFIG_IND_TBL_SIZE]; for (i = 0; i < Q8_CONFIG_IND_TBL_SIZE; i++) { rss_ind_tbl[i] = i % ha->hw.num_sds_rings; } for (i = 0; i <= Q8_RSS_IND_TBL_MAX_IDX ; i = i + Q8_CONFIG_IND_TBL_SIZE) { if ((i + Q8_CONFIG_IND_TBL_SIZE) > Q8_RSS_IND_TBL_MAX_IDX) { count = Q8_RSS_IND_TBL_MAX_IDX - i + 1; } else { count = Q8_CONFIG_IND_TBL_SIZE; } if (qla_set_rss_ind_table(ha, i, count, ha->hw.rcv_cntxt_id, rss_ind_tbl)) return (-1); } return (0); } /* * Name: ql_del_hw_if * Function: Destroys the hardware specific entities corresponding to an * Ethernet Interface */ void ql_del_hw_if(qla_host_t *ha) { uint32_t i; uint32_t num_msix; (void)qla_stop_nic_func(ha); qla_del_rcv_cntxt(ha); + qla_del_xmt_cntxt(ha); if (ha->hw.flags.init_intr_cnxt) { for (i = 0; i < ha->hw.num_sds_rings; ) { if ((i + Q8_MAX_INTR_VECTORS) < ha->hw.num_sds_rings) num_msix = Q8_MAX_INTR_VECTORS; else num_msix = ha->hw.num_sds_rings - i; qla_config_intr_cntxt(ha, i, num_msix, 0); i += num_msix; } ha->hw.flags.init_intr_cnxt = 0; } + return; } void qla_confirm_9kb_enable(qla_host_t *ha) { uint32_t supports_9kb = 0; ha->hw.mbx_intr_mask_offset = READ_REG32(ha, Q8_MBOX_INT_MASK_MSIX); /* Use MSI-X vector 0; Enable Firmware Mailbox Interrupt */ WRITE_REG32(ha, Q8_MBOX_INT_ENABLE, BIT_2); WRITE_REG32(ha, ha->hw.mbx_intr_mask_offset, 0x0); qla_get_nic_partition(ha, &supports_9kb, NULL); if (!supports_9kb) ha->hw.enable_9kb = 0; return; } /* * Name: ql_init_hw_if * Function: Creates the hardware specific entities corresponding to an * Ethernet Interface - Transmit and Receive Contexts. Sets the MAC Address * corresponding to the interface. Enables LRO if allowed. */ int ql_init_hw_if(qla_host_t *ha) { device_t dev; uint32_t i; uint8_t bcast_mac[6]; qla_rdesc_t *rdesc; uint32_t num_msix; dev = ha->pci_dev; for (i = 0; i < ha->hw.num_sds_rings; i++) { bzero(ha->hw.dma_buf.sds_ring[i].dma_b, ha->hw.dma_buf.sds_ring[i].size); } for (i = 0; i < ha->hw.num_sds_rings; ) { if ((i + Q8_MAX_INTR_VECTORS) < ha->hw.num_sds_rings) num_msix = Q8_MAX_INTR_VECTORS; else num_msix = ha->hw.num_sds_rings - i; if (qla_config_intr_cntxt(ha, i, num_msix, 1)) { if (i > 0) { num_msix = i; for (i = 0; i < num_msix; ) { qla_config_intr_cntxt(ha, i, Q8_MAX_INTR_VECTORS, 0); i += Q8_MAX_INTR_VECTORS; } } return (-1); } i = i + num_msix; } ha->hw.flags.init_intr_cnxt = 1; /* * Create Receive Context */ if (qla_init_rcv_cntxt(ha)) { return (-1); } for (i = 0; i < ha->hw.num_rds_rings; i++) { rdesc = &ha->hw.rds[i]; rdesc->rx_next = NUM_RX_DESCRIPTORS - 2; rdesc->rx_in = 0; /* Update the RDS Producer Indices */ QL_UPDATE_RDS_PRODUCER_INDEX(ha, rdesc->prod_std,\ rdesc->rx_next); } /* * Create Transmit Context */ if (qla_init_xmt_cntxt(ha)) { qla_del_rcv_cntxt(ha); return (-1); } ha->hw.max_tx_segs = 0; - if (qla_config_mac_addr(ha, ha->hw.mac_addr, 1)) + if (qla_config_mac_addr(ha, ha->hw.mac_addr, 1, 1)) return(-1); ha->hw.flags.unicast_mac = 1; bcast_mac[0] = 0xFF; bcast_mac[1] = 0xFF; bcast_mac[2] = 0xFF; bcast_mac[3] = 0xFF; bcast_mac[4] = 0xFF; bcast_mac[5] = 0xFF; - if (qla_config_mac_addr(ha, bcast_mac, 1)) + if (qla_config_mac_addr(ha, bcast_mac, 1, 1)) return (-1); ha->hw.flags.bcast_mac = 1; /* * program any cached multicast addresses */ if (qla_hw_add_all_mcast(ha)) return (-1); if (qla_config_rss(ha, ha->hw.rcv_cntxt_id)) return (-1); if (qla_config_rss_ind_table(ha)) return (-1); if (qla_config_intr_coalesce(ha, ha->hw.rcv_cntxt_id, 0, 1)) return (-1); if (qla_link_event_req(ha, ha->hw.rcv_cntxt_id)) return (-1); if (qla_config_fw_lro(ha, ha->hw.rcv_cntxt_id)) return (-1); if (qla_init_nic_func(ha)) return (-1); if (qla_query_fw_dcbx_caps(ha)) return (-1); for (i = 0; i < ha->hw.num_sds_rings; i++) QL_ENABLE_INTERRUPTS(ha, i); return (0); } static int qla_map_sds_to_rds(qla_host_t *ha, uint32_t start_idx, uint32_t num_idx) { device_t dev = ha->pci_dev; q80_rq_map_sds_to_rds_t *map_rings; q80_rsp_map_sds_to_rds_t *map_rings_rsp; uint32_t i, err; qla_hw_t *hw = &ha->hw; map_rings = (q80_rq_map_sds_to_rds_t *)ha->hw.mbox; bzero(map_rings, sizeof(q80_rq_map_sds_to_rds_t)); map_rings->opcode = Q8_MBX_MAP_SDS_TO_RDS; map_rings->count_version = (sizeof (q80_rq_map_sds_to_rds_t) >> 2); map_rings->count_version |= Q8_MBX_CMD_VERSION; map_rings->cntxt_id = hw->rcv_cntxt_id; map_rings->num_rings = num_idx; for (i = 0; i < num_idx; i++) { map_rings->sds_rds[i].sds_ring = i + start_idx; map_rings->sds_rds[i].rds_ring = i + start_idx; } if (qla_mbx_cmd(ha, (uint32_t *)map_rings, (sizeof (q80_rq_map_sds_to_rds_t) >> 2), ha->hw.mbox, (sizeof(q80_rsp_add_rcv_rings_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } map_rings_rsp = (q80_rsp_map_sds_to_rds_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(map_rings_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } return (0); } /* * Name: qla_init_rcv_cntxt * Function: Creates the Receive Context. */ static int qla_init_rcv_cntxt(qla_host_t *ha) { q80_rq_rcv_cntxt_t *rcntxt; q80_rsp_rcv_cntxt_t *rcntxt_rsp; q80_stat_desc_t *sdesc; int i, j; qla_hw_t *hw = &ha->hw; device_t dev; uint32_t err; uint32_t rcntxt_sds_rings; uint32_t rcntxt_rds_rings; uint32_t max_idx; dev = ha->pci_dev; /* * Create Receive Context */ for (i = 0; i < hw->num_sds_rings; i++) { sdesc = (q80_stat_desc_t *)&hw->sds[i].sds_ring_base[0]; for (j = 0; j < NUM_STATUS_DESCRIPTORS; j++) { sdesc->data[0] = 1ULL; sdesc->data[1] = 1ULL; } } rcntxt_sds_rings = hw->num_sds_rings; if (hw->num_sds_rings > MAX_RCNTXT_SDS_RINGS) rcntxt_sds_rings = MAX_RCNTXT_SDS_RINGS; rcntxt_rds_rings = hw->num_rds_rings; if (hw->num_rds_rings > MAX_RDS_RING_SETS) rcntxt_rds_rings = MAX_RDS_RING_SETS; rcntxt = (q80_rq_rcv_cntxt_t *)ha->hw.mbox; bzero(rcntxt, (sizeof (q80_rq_rcv_cntxt_t))); rcntxt->opcode = Q8_MBX_CREATE_RX_CNTXT; rcntxt->count_version = (sizeof (q80_rq_rcv_cntxt_t) >> 2); rcntxt->count_version |= Q8_MBX_CMD_VERSION; rcntxt->cap0 = Q8_RCV_CNTXT_CAP0_BASEFW | Q8_RCV_CNTXT_CAP0_LRO | Q8_RCV_CNTXT_CAP0_HW_LRO | Q8_RCV_CNTXT_CAP0_RSS | Q8_RCV_CNTXT_CAP0_SGL_LRO; if (ha->hw.enable_9kb) rcntxt->cap0 |= Q8_RCV_CNTXT_CAP0_SINGLE_JUMBO; else rcntxt->cap0 |= Q8_RCV_CNTXT_CAP0_SGL_JUMBO; if (ha->hw.num_rds_rings > 1) { rcntxt->nrds_sets_rings = rcntxt_rds_rings | (1 << 5); rcntxt->cap0 |= Q8_RCV_CNTXT_CAP0_MULTI_RDS; } else rcntxt->nrds_sets_rings = 0x1 | (1 << 5); rcntxt->nsds_rings = rcntxt_sds_rings; rcntxt->rds_producer_mode = Q8_RCV_CNTXT_RDS_PROD_MODE_UNIQUE; rcntxt->rcv_vpid = 0; for (i = 0; i < rcntxt_sds_rings; i++) { rcntxt->sds[i].paddr = qla_host_to_le64(hw->dma_buf.sds_ring[i].dma_addr); rcntxt->sds[i].size = qla_host_to_le32(NUM_STATUS_DESCRIPTORS); if (ha->msix_count == 2) { rcntxt->sds[i].intr_id = qla_host_to_le16(hw->intr_id[0]); rcntxt->sds[i].intr_src_bit = qla_host_to_le16((i)); } else { rcntxt->sds[i].intr_id = qla_host_to_le16(hw->intr_id[i]); rcntxt->sds[i].intr_src_bit = qla_host_to_le16(0); } } for (i = 0; i < rcntxt_rds_rings; i++) { rcntxt->rds[i].paddr_std = qla_host_to_le64(hw->dma_buf.rds_ring[i].dma_addr); if (ha->hw.enable_9kb) rcntxt->rds[i].std_bsize = qla_host_to_le64(MJUM9BYTES); else rcntxt->rds[i].std_bsize = qla_host_to_le64(MCLBYTES); rcntxt->rds[i].std_nentries = qla_host_to_le32(NUM_RX_DESCRIPTORS); } if (qla_mbx_cmd(ha, (uint32_t *)rcntxt, (sizeof (q80_rq_rcv_cntxt_t) >> 2), ha->hw.mbox, (sizeof(q80_rsp_rcv_cntxt_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } rcntxt_rsp = (q80_rsp_rcv_cntxt_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(rcntxt_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } for (i = 0; i < rcntxt_sds_rings; i++) { hw->sds[i].sds_consumer = rcntxt_rsp->sds_cons[i]; } for (i = 0; i < rcntxt_rds_rings; i++) { hw->rds[i].prod_std = rcntxt_rsp->rds[i].prod_std; } hw->rcv_cntxt_id = rcntxt_rsp->cntxt_id; ha->hw.flags.init_rx_cnxt = 1; if (hw->num_sds_rings > MAX_RCNTXT_SDS_RINGS) { for (i = MAX_RCNTXT_SDS_RINGS; i < hw->num_sds_rings;) { if ((i + MAX_RCNTXT_SDS_RINGS) < hw->num_sds_rings) max_idx = MAX_RCNTXT_SDS_RINGS; else max_idx = hw->num_sds_rings - i; err = qla_add_rcv_rings(ha, i, max_idx); if (err) return -1; i += max_idx; } } if (hw->num_rds_rings > 1) { for (i = 0; i < hw->num_rds_rings; ) { if ((i + MAX_SDS_TO_RDS_MAP) < hw->num_rds_rings) max_idx = MAX_SDS_TO_RDS_MAP; else max_idx = hw->num_rds_rings - i; err = qla_map_sds_to_rds(ha, i, max_idx); if (err) return -1; i += max_idx; } } return (0); } static int qla_add_rcv_rings(qla_host_t *ha, uint32_t sds_idx, uint32_t nsds) { device_t dev = ha->pci_dev; q80_rq_add_rcv_rings_t *add_rcv; q80_rsp_add_rcv_rings_t *add_rcv_rsp; uint32_t i,j, err; qla_hw_t *hw = &ha->hw; add_rcv = (q80_rq_add_rcv_rings_t *)ha->hw.mbox; bzero(add_rcv, sizeof (q80_rq_add_rcv_rings_t)); add_rcv->opcode = Q8_MBX_ADD_RX_RINGS; add_rcv->count_version = (sizeof (q80_rq_add_rcv_rings_t) >> 2); add_rcv->count_version |= Q8_MBX_CMD_VERSION; add_rcv->nrds_sets_rings = nsds | (1 << 5); add_rcv->nsds_rings = nsds; add_rcv->cntxt_id = hw->rcv_cntxt_id; for (i = 0; i < nsds; i++) { j = i + sds_idx; add_rcv->sds[i].paddr = qla_host_to_le64(hw->dma_buf.sds_ring[j].dma_addr); add_rcv->sds[i].size = qla_host_to_le32(NUM_STATUS_DESCRIPTORS); if (ha->msix_count == 2) { add_rcv->sds[i].intr_id = qla_host_to_le16(hw->intr_id[0]); add_rcv->sds[i].intr_src_bit = qla_host_to_le16(j); } else { add_rcv->sds[i].intr_id = qla_host_to_le16(hw->intr_id[j]); add_rcv->sds[i].intr_src_bit = qla_host_to_le16(0); } } for (i = 0; (i < nsds); i++) { j = i + sds_idx; add_rcv->rds[i].paddr_std = qla_host_to_le64(hw->dma_buf.rds_ring[j].dma_addr); if (ha->hw.enable_9kb) add_rcv->rds[i].std_bsize = qla_host_to_le64(MJUM9BYTES); else add_rcv->rds[i].std_bsize = qla_host_to_le64(MCLBYTES); add_rcv->rds[i].std_nentries = qla_host_to_le32(NUM_RX_DESCRIPTORS); } if (qla_mbx_cmd(ha, (uint32_t *)add_rcv, (sizeof (q80_rq_add_rcv_rings_t) >> 2), ha->hw.mbox, (sizeof(q80_rsp_add_rcv_rings_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } add_rcv_rsp = (q80_rsp_add_rcv_rings_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(add_rcv_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } for (i = 0; i < nsds; i++) { hw->sds[(i + sds_idx)].sds_consumer = add_rcv_rsp->sds_cons[i]; } for (i = 0; i < nsds; i++) { hw->rds[(i + sds_idx)].prod_std = add_rcv_rsp->rds[i].prod_std; } return (0); } /* * Name: qla_del_rcv_cntxt * Function: Destroys the Receive Context. */ static void qla_del_rcv_cntxt(qla_host_t *ha) { device_t dev = ha->pci_dev; q80_rcv_cntxt_destroy_t *rcntxt; q80_rcv_cntxt_destroy_rsp_t *rcntxt_rsp; uint32_t err; uint8_t bcast_mac[6]; if (!ha->hw.flags.init_rx_cnxt) return; if (qla_hw_del_all_mcast(ha)) return; if (ha->hw.flags.bcast_mac) { bcast_mac[0] = 0xFF; bcast_mac[1] = 0xFF; bcast_mac[2] = 0xFF; bcast_mac[3] = 0xFF; bcast_mac[4] = 0xFF; bcast_mac[5] = 0xFF; - if (qla_config_mac_addr(ha, bcast_mac, 0)) + if (qla_config_mac_addr(ha, bcast_mac, 0, 1)) return; ha->hw.flags.bcast_mac = 0; } if (ha->hw.flags.unicast_mac) { - if (qla_config_mac_addr(ha, ha->hw.mac_addr, 0)) + if (qla_config_mac_addr(ha, ha->hw.mac_addr, 0, 1)) return; ha->hw.flags.unicast_mac = 0; } rcntxt = (q80_rcv_cntxt_destroy_t *)ha->hw.mbox; bzero(rcntxt, (sizeof (q80_rcv_cntxt_destroy_t))); rcntxt->opcode = Q8_MBX_DESTROY_RX_CNTXT; rcntxt->count_version = (sizeof (q80_rcv_cntxt_destroy_t) >> 2); rcntxt->count_version |= Q8_MBX_CMD_VERSION; rcntxt->cntxt_id = ha->hw.rcv_cntxt_id; if (qla_mbx_cmd(ha, (uint32_t *)rcntxt, (sizeof (q80_rcv_cntxt_destroy_t) >> 2), ha->hw.mbox, (sizeof(q80_rcv_cntxt_destroy_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return; } rcntxt_rsp = (q80_rcv_cntxt_destroy_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(rcntxt_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); } ha->hw.flags.init_rx_cnxt = 0; return; } /* * Name: qla_init_xmt_cntxt * Function: Creates the Transmit Context. */ static int qla_init_xmt_cntxt_i(qla_host_t *ha, uint32_t txr_idx) { device_t dev; qla_hw_t *hw = &ha->hw; q80_rq_tx_cntxt_t *tcntxt; q80_rsp_tx_cntxt_t *tcntxt_rsp; uint32_t err; qla_hw_tx_cntxt_t *hw_tx_cntxt; hw_tx_cntxt = &hw->tx_cntxt[txr_idx]; dev = ha->pci_dev; /* * Create Transmit Context */ tcntxt = (q80_rq_tx_cntxt_t *)ha->hw.mbox; bzero(tcntxt, (sizeof (q80_rq_tx_cntxt_t))); tcntxt->opcode = Q8_MBX_CREATE_TX_CNTXT; tcntxt->count_version = (sizeof (q80_rq_tx_cntxt_t) >> 2); tcntxt->count_version |= Q8_MBX_CMD_VERSION; #ifdef QL_ENABLE_ISCSI_TLV tcntxt->cap0 = Q8_TX_CNTXT_CAP0_BASEFW | Q8_TX_CNTXT_CAP0_LSO | Q8_TX_CNTXT_CAP0_TC; if (txr_idx >= (ha->hw.num_tx_rings >> 1)) { tcntxt->traffic_class = 1; } #else tcntxt->cap0 = Q8_TX_CNTXT_CAP0_BASEFW | Q8_TX_CNTXT_CAP0_LSO; #endif /* #ifdef QL_ENABLE_ISCSI_TLV */ tcntxt->ntx_rings = 1; tcntxt->tx_ring[0].paddr = qla_host_to_le64(hw_tx_cntxt->tx_ring_paddr); tcntxt->tx_ring[0].tx_consumer = qla_host_to_le64(hw_tx_cntxt->tx_cons_paddr); tcntxt->tx_ring[0].nentries = qla_host_to_le16(NUM_TX_DESCRIPTORS); tcntxt->tx_ring[0].intr_id = qla_host_to_le16(hw->intr_id[0]); tcntxt->tx_ring[0].intr_src_bit = qla_host_to_le16(0); hw_tx_cntxt->txr_free = NUM_TX_DESCRIPTORS; hw_tx_cntxt->txr_next = hw_tx_cntxt->txr_comp = 0; if (qla_mbx_cmd(ha, (uint32_t *)tcntxt, (sizeof (q80_rq_tx_cntxt_t) >> 2), ha->hw.mbox, (sizeof(q80_rsp_tx_cntxt_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } tcntxt_rsp = (q80_rsp_tx_cntxt_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(tcntxt_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return -1; } hw_tx_cntxt->tx_prod_reg = tcntxt_rsp->tx_ring[0].prod_index; hw_tx_cntxt->tx_cntxt_id = tcntxt_rsp->tx_ring[0].cntxt_id; if (qla_config_intr_coalesce(ha, hw_tx_cntxt->tx_cntxt_id, 0, 0)) return (-1); return (0); } /* * Name: qla_del_xmt_cntxt * Function: Destroys the Transmit Context. */ static int qla_del_xmt_cntxt_i(qla_host_t *ha, uint32_t txr_idx) { device_t dev = ha->pci_dev; q80_tx_cntxt_destroy_t *tcntxt; q80_tx_cntxt_destroy_rsp_t *tcntxt_rsp; uint32_t err; tcntxt = (q80_tx_cntxt_destroy_t *)ha->hw.mbox; bzero(tcntxt, (sizeof (q80_tx_cntxt_destroy_t))); tcntxt->opcode = Q8_MBX_DESTROY_TX_CNTXT; tcntxt->count_version = (sizeof (q80_tx_cntxt_destroy_t) >> 2); tcntxt->count_version |= Q8_MBX_CMD_VERSION; tcntxt->cntxt_id = ha->hw.tx_cntxt[txr_idx].tx_cntxt_id; if (qla_mbx_cmd(ha, (uint32_t *)tcntxt, (sizeof (q80_tx_cntxt_destroy_t) >> 2), ha->hw.mbox, (sizeof (q80_tx_cntxt_destroy_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed0\n", __func__); return (-1); } tcntxt_rsp = (q80_tx_cntxt_destroy_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(tcntxt_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed1 [0x%08x]\n", __func__, err); return (-1); } return (0); } static void qla_del_xmt_cntxt(qla_host_t *ha) { uint32_t i; if (!ha->hw.flags.init_tx_cnxt) return; for (i = 0; i < ha->hw.num_tx_rings; i++) { if (qla_del_xmt_cntxt_i(ha, i)) break; } ha->hw.flags.init_tx_cnxt = 0; } static int qla_init_xmt_cntxt(qla_host_t *ha) { uint32_t i, j; for (i = 0; i < ha->hw.num_tx_rings; i++) { if (qla_init_xmt_cntxt_i(ha, i) != 0) { for (j = 0; j < i; j++) qla_del_xmt_cntxt_i(ha, j); return (-1); } } ha->hw.flags.init_tx_cnxt = 1; return (0); } static int -qla_hw_add_all_mcast(qla_host_t *ha) +qla_hw_all_mcast(qla_host_t *ha, uint32_t add_mcast) { int i, nmcast; + uint32_t count = 0; + uint8_t *mcast; nmcast = ha->hw.nmcast; + QL_DPRINT2(ha, (ha->pci_dev, + "%s:[0x%x] enter nmcast = %d \n", __func__, add_mcast, nmcast)); + + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); + for (i = 0 ; ((i < Q8_MAX_NUM_MULTICAST_ADDRS) && nmcast); i++) { if ((ha->hw.mcast[i].addr[0] != 0) || (ha->hw.mcast[i].addr[1] != 0) || (ha->hw.mcast[i].addr[2] != 0) || (ha->hw.mcast[i].addr[3] != 0) || (ha->hw.mcast[i].addr[4] != 0) || (ha->hw.mcast[i].addr[5] != 0)) { - if (qla_config_mac_addr(ha, ha->hw.mcast[i].addr, 1)) { - device_printf(ha->pci_dev, "%s: failed\n", - __func__); - return (-1); + bcopy(ha->hw.mcast[i].addr, mcast, ETHER_ADDR_LEN); + mcast = mcast + ETHER_ADDR_LEN; + count++; + + if (count == Q8_MAX_MAC_ADDRS) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, + add_mcast, count)) { + device_printf(ha->pci_dev, + "%s: failed\n", __func__); + return (-1); + } + + count = 0; + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, + (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); } nmcast--; } } + + if (count) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, add_mcast, + count)) { + device_printf(ha->pci_dev, "%s: failed\n", __func__); + return (-1); + } + } + QL_DPRINT2(ha, (ha->pci_dev, + "%s:[0x%x] exit nmcast = %d \n", __func__, add_mcast, nmcast)); + return 0; } static int -qla_hw_del_all_mcast(qla_host_t *ha) +qla_hw_add_all_mcast(qla_host_t *ha) { - int i, nmcast; + int ret; - nmcast = ha->hw.nmcast; + ret = qla_hw_all_mcast(ha, 1); - for (i = 0 ; ((i < Q8_MAX_NUM_MULTICAST_ADDRS) && nmcast); i++) { - if ((ha->hw.mcast[i].addr[0] != 0) || - (ha->hw.mcast[i].addr[1] != 0) || - (ha->hw.mcast[i].addr[2] != 0) || - (ha->hw.mcast[i].addr[3] != 0) || - (ha->hw.mcast[i].addr[4] != 0) || - (ha->hw.mcast[i].addr[5] != 0)) { + return (ret); +} - if (qla_config_mac_addr(ha, ha->hw.mcast[i].addr, 0)) - return (-1); +static int +qla_hw_del_all_mcast(qla_host_t *ha) +{ + int ret; - nmcast--; - } - } - return 0; + ret = qla_hw_all_mcast(ha, 0); + + bzero(ha->hw.mcast, (sizeof (qla_mcast_t) * Q8_MAX_NUM_MULTICAST_ADDRS)); + ha->hw.nmcast = 0; + + return (ret); } static int -qla_hw_add_mcast(qla_host_t *ha, uint8_t *mta) +qla_hw_mac_addr_present(qla_host_t *ha, uint8_t *mta) { int i; for (i = 0; i < Q8_MAX_NUM_MULTICAST_ADDRS; i++) { - if (QL_MAC_CMP(ha->hw.mcast[i].addr, mta) == 0) - return 0; /* its been already added */ + return (0); /* its been already added */ } + return (-1); +} +static int +qla_hw_add_mcast(qla_host_t *ha, uint8_t *mta, uint32_t nmcast) +{ + int i; + for (i = 0; i < Q8_MAX_NUM_MULTICAST_ADDRS; i++) { if ((ha->hw.mcast[i].addr[0] == 0) && (ha->hw.mcast[i].addr[1] == 0) && (ha->hw.mcast[i].addr[2] == 0) && (ha->hw.mcast[i].addr[3] == 0) && (ha->hw.mcast[i].addr[4] == 0) && (ha->hw.mcast[i].addr[5] == 0)) { - if (qla_config_mac_addr(ha, mta, 1)) - return (-1); - bcopy(mta, ha->hw.mcast[i].addr, Q8_MAC_ADDR_LEN); ha->hw.nmcast++; - return 0; + mta = mta + ETHER_ADDR_LEN; + nmcast--; + + if (nmcast == 0) + break; } + } return 0; } static int -qla_hw_del_mcast(qla_host_t *ha, uint8_t *mta) +qla_hw_del_mcast(qla_host_t *ha, uint8_t *mta, uint32_t nmcast) { int i; for (i = 0; i < Q8_MAX_NUM_MULTICAST_ADDRS; i++) { if (QL_MAC_CMP(ha->hw.mcast[i].addr, mta) == 0) { - if (qla_config_mac_addr(ha, mta, 0)) - return (-1); - ha->hw.mcast[i].addr[0] = 0; ha->hw.mcast[i].addr[1] = 0; ha->hw.mcast[i].addr[2] = 0; ha->hw.mcast[i].addr[3] = 0; ha->hw.mcast[i].addr[4] = 0; ha->hw.mcast[i].addr[5] = 0; ha->hw.nmcast--; - return 0; + mta = mta + ETHER_ADDR_LEN; + nmcast--; + + if (nmcast == 0) + break; } } return 0; } /* * Name: ql_hw_set_multi - * Function: Sets the Multicast Addresses provided the host O.S into the + * Function: Sets the Multicast Addresses provided by the host O.S into the * hardware (for the given interface) */ int -ql_hw_set_multi(qla_host_t *ha, uint8_t *mcast, uint32_t mcnt, +ql_hw_set_multi(qla_host_t *ha, uint8_t *mcast_addr, uint32_t mcnt, uint32_t add_mac) { + uint8_t *mta = mcast_addr; int i; - uint8_t *mta = mcast; int ret = 0; + uint32_t count = 0; + uint8_t *mcast; + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); + for (i = 0; i < mcnt; i++) { - if (add_mac) { - ret = qla_hw_add_mcast(ha, mta); - if (ret) - break; - } else { - ret = qla_hw_del_mcast(ha, mta); - if (ret) - break; + if (mta[0] || mta[1] || mta[2] || mta[3] || mta[4] || mta[5]) { + if (add_mac) { + if (qla_hw_mac_addr_present(ha, mta) != 0) { + bcopy(mta, mcast, ETHER_ADDR_LEN); + mcast = mcast + ETHER_ADDR_LEN; + count++; + } + } else { + if (qla_hw_mac_addr_present(ha, mta) == 0) { + bcopy(mta, mcast, ETHER_ADDR_LEN); + mcast = mcast + ETHER_ADDR_LEN; + count++; + } + } } + if (count == Q8_MAX_MAC_ADDRS) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, + add_mac, count)) { + device_printf(ha->pci_dev, "%s: failed\n", + __func__); + return (-1); + } + + if (add_mac) { + qla_hw_add_mcast(ha, ha->hw.mac_addr_arr, + count); + } else { + qla_hw_del_mcast(ha, ha->hw.mac_addr_arr, + count); + } + + count = 0; + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); + } mta += Q8_MAC_ADDR_LEN; } + + if (count) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, add_mac, + count)) { + device_printf(ha->pci_dev, "%s: failed\n", __func__); + return (-1); + } + if (add_mac) { + qla_hw_add_mcast(ha, ha->hw.mac_addr_arr, count); + } else { + qla_hw_del_mcast(ha, ha->hw.mac_addr_arr, count); + } + } + return (ret); } /* * Name: qla_hw_tx_done_locked * Function: Handle Transmit Completions */ static void qla_hw_tx_done_locked(qla_host_t *ha, uint32_t txr_idx) { qla_tx_buf_t *txb; qla_hw_t *hw = &ha->hw; uint32_t comp_idx, comp_count = 0; qla_hw_tx_cntxt_t *hw_tx_cntxt; hw_tx_cntxt = &hw->tx_cntxt[txr_idx]; /* retrieve index of last entry in tx ring completed */ comp_idx = qla_le32_to_host(*(hw_tx_cntxt->tx_cons)); while (comp_idx != hw_tx_cntxt->txr_comp) { txb = &ha->tx_ring[txr_idx].tx_buf[hw_tx_cntxt->txr_comp]; hw_tx_cntxt->txr_comp++; if (hw_tx_cntxt->txr_comp == NUM_TX_DESCRIPTORS) hw_tx_cntxt->txr_comp = 0; comp_count++; if (txb->m_head) { if_inc_counter(ha->ifp, IFCOUNTER_OPACKETS, 1); bus_dmamap_sync(ha->tx_tag, txb->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(ha->tx_tag, txb->map); m_freem(txb->m_head); txb->m_head = NULL; } } hw_tx_cntxt->txr_free += comp_count; return; } /* * Name: ql_hw_tx_done * Function: Handle Transmit Completions */ void ql_hw_tx_done(qla_host_t *ha) { int i; uint32_t flag = 0; if (!mtx_trylock(&ha->tx_lock)) { QL_DPRINT8(ha, (ha->pci_dev, "%s: !mtx_trylock(&ha->tx_lock)\n", __func__)); return; } for (i = 0; i < ha->hw.num_tx_rings; i++) { qla_hw_tx_done_locked(ha, i); if (ha->hw.tx_cntxt[i].txr_free <= (NUM_TX_DESCRIPTORS >> 1)) flag = 1; } if (!flag) ha->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; QLA_TX_UNLOCK(ha); return; } void ql_update_link_state(qla_host_t *ha) { uint32_t link_state; uint32_t prev_link_state; if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) { ha->hw.link_up = 0; return; } link_state = READ_REG32(ha, Q8_LINK_STATE); prev_link_state = ha->hw.link_up; if (ha->pci_func == 0) ha->hw.link_up = (((link_state & 0xF) == 1)? 1 : 0); else ha->hw.link_up = ((((link_state >> 4)& 0xF) == 1)? 1 : 0); if (prev_link_state != ha->hw.link_up) { if (ha->hw.link_up) { if_link_state_change(ha->ifp, LINK_STATE_UP); } else { if_link_state_change(ha->ifp, LINK_STATE_DOWN); } } return; } void ql_hw_stop_rcv(qla_host_t *ha) { int i, done, count = 100; ha->flags.stop_rcv = 1; while (count) { done = 1; for (i = 0; i < ha->hw.num_sds_rings; i++) { if (ha->hw.sds[i].rcv_active) done = 0; } if (done) break; else qla_mdelay(__func__, 10); count--; } if (!count) device_printf(ha->pci_dev, "%s: Counter expired.\n", __func__); return; } int ql_hw_check_health(qla_host_t *ha) { uint32_t val; ha->hw.health_count++; if (ha->hw.health_count < 1000) return 0; ha->hw.health_count = 0; val = READ_REG32(ha, Q8_ASIC_TEMPERATURE); if (((val & 0xFFFF) == 2) || ((val & 0xFFFF) == 3) || (QL_ERR_INJECT(ha, INJCT_TEMPERATURE_FAILURE))) { device_printf(ha->pci_dev, "%s: Temperature Alert [0x%08x]\n", __func__, val); return -1; } val = READ_REG32(ha, Q8_FIRMWARE_HEARTBEAT); if ((val != ha->hw.hbeat_value) && (!(QL_ERR_INJECT(ha, INJCT_HEARTBEAT_FAILURE)))) { ha->hw.hbeat_value = val; return 0; } device_printf(ha->pci_dev, "%s: Heartbeat Failue [0x%08x]\n", __func__, val); return -1; } static int qla_init_nic_func(qla_host_t *ha) { device_t dev; q80_init_nic_func_t *init_nic; q80_init_nic_func_rsp_t *init_nic_rsp; uint32_t err; dev = ha->pci_dev; init_nic = (q80_init_nic_func_t *)ha->hw.mbox; bzero(init_nic, sizeof(q80_init_nic_func_t)); init_nic->opcode = Q8_MBX_INIT_NIC_FUNC; init_nic->count_version = (sizeof (q80_init_nic_func_t) >> 2); init_nic->count_version |= Q8_MBX_CMD_VERSION; init_nic->options = Q8_INIT_NIC_REG_DCBX_CHNG_AEN; init_nic->options |= Q8_INIT_NIC_REG_SFP_CHNG_AEN; init_nic->options |= Q8_INIT_NIC_REG_IDC_AEN; //qla_dump_buf8(ha, __func__, init_nic, sizeof (q80_init_nic_func_t)); if (qla_mbx_cmd(ha, (uint32_t *)init_nic, (sizeof (q80_init_nic_func_t) >> 2), ha->hw.mbox, (sizeof (q80_init_nic_func_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } init_nic_rsp = (q80_init_nic_func_rsp_t *)ha->hw.mbox; // qla_dump_buf8(ha, __func__, init_nic_rsp, sizeof (q80_init_nic_func_rsp_t)); err = Q8_MBX_RSP_STATUS(init_nic_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } return 0; } static int qla_stop_nic_func(qla_host_t *ha) { device_t dev; q80_stop_nic_func_t *stop_nic; q80_stop_nic_func_rsp_t *stop_nic_rsp; uint32_t err; dev = ha->pci_dev; stop_nic = (q80_stop_nic_func_t *)ha->hw.mbox; bzero(stop_nic, sizeof(q80_stop_nic_func_t)); stop_nic->opcode = Q8_MBX_STOP_NIC_FUNC; stop_nic->count_version = (sizeof (q80_stop_nic_func_t) >> 2); stop_nic->count_version |= Q8_MBX_CMD_VERSION; stop_nic->options = Q8_STOP_NIC_DEREG_DCBX_CHNG_AEN; stop_nic->options |= Q8_STOP_NIC_DEREG_SFP_CHNG_AEN; //qla_dump_buf8(ha, __func__, stop_nic, sizeof (q80_stop_nic_func_t)); if (qla_mbx_cmd(ha, (uint32_t *)stop_nic, (sizeof (q80_stop_nic_func_t) >> 2), ha->hw.mbox, (sizeof (q80_stop_nic_func_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } stop_nic_rsp = (q80_stop_nic_func_rsp_t *)ha->hw.mbox; //qla_dump_buf8(ha, __func__, stop_nic_rsp, sizeof (q80_stop_nic_func_rsp_ t)); err = Q8_MBX_RSP_STATUS(stop_nic_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } return 0; } static int qla_query_fw_dcbx_caps(qla_host_t *ha) { device_t dev; q80_query_fw_dcbx_caps_t *fw_dcbx; q80_query_fw_dcbx_caps_rsp_t *fw_dcbx_rsp; uint32_t err; dev = ha->pci_dev; fw_dcbx = (q80_query_fw_dcbx_caps_t *)ha->hw.mbox; bzero(fw_dcbx, sizeof(q80_query_fw_dcbx_caps_t)); fw_dcbx->opcode = Q8_MBX_GET_FW_DCBX_CAPS; fw_dcbx->count_version = (sizeof (q80_query_fw_dcbx_caps_t) >> 2); fw_dcbx->count_version |= Q8_MBX_CMD_VERSION; ql_dump_buf8(ha, __func__, fw_dcbx, sizeof (q80_query_fw_dcbx_caps_t)); if (qla_mbx_cmd(ha, (uint32_t *)fw_dcbx, (sizeof (q80_query_fw_dcbx_caps_t) >> 2), ha->hw.mbox, (sizeof (q80_query_fw_dcbx_caps_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } fw_dcbx_rsp = (q80_query_fw_dcbx_caps_rsp_t *)ha->hw.mbox; ql_dump_buf8(ha, __func__, fw_dcbx_rsp, sizeof (q80_query_fw_dcbx_caps_rsp_t)); err = Q8_MBX_RSP_STATUS(fw_dcbx_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); } return 0; } static int qla_idc_ack(qla_host_t *ha, uint32_t aen_mb1, uint32_t aen_mb2, uint32_t aen_mb3, uint32_t aen_mb4) { device_t dev; q80_idc_ack_t *idc_ack; q80_idc_ack_rsp_t *idc_ack_rsp; uint32_t err; int count = 300; dev = ha->pci_dev; idc_ack = (q80_idc_ack_t *)ha->hw.mbox; bzero(idc_ack, sizeof(q80_idc_ack_t)); idc_ack->opcode = Q8_MBX_IDC_ACK; idc_ack->count_version = (sizeof (q80_idc_ack_t) >> 2); idc_ack->count_version |= Q8_MBX_CMD_VERSION; idc_ack->aen_mb1 = aen_mb1; idc_ack->aen_mb2 = aen_mb2; idc_ack->aen_mb3 = aen_mb3; idc_ack->aen_mb4 = aen_mb4; ha->hw.imd_compl= 0; if (qla_mbx_cmd(ha, (uint32_t *)idc_ack, (sizeof (q80_idc_ack_t) >> 2), ha->hw.mbox, (sizeof (q80_idc_ack_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } idc_ack_rsp = (q80_idc_ack_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(idc_ack_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); return(-1); } while (count && !ha->hw.imd_compl) { qla_mdelay(__func__, 100); count--; } if (!count) return -1; else device_printf(dev, "%s: count %d\n", __func__, count); return (0); } static int qla_set_port_config(qla_host_t *ha, uint32_t cfg_bits) { device_t dev; q80_set_port_cfg_t *pcfg; q80_set_port_cfg_rsp_t *pfg_rsp; uint32_t err; int count = 300; dev = ha->pci_dev; pcfg = (q80_set_port_cfg_t *)ha->hw.mbox; bzero(pcfg, sizeof(q80_set_port_cfg_t)); pcfg->opcode = Q8_MBX_SET_PORT_CONFIG; pcfg->count_version = (sizeof (q80_set_port_cfg_t) >> 2); pcfg->count_version |= Q8_MBX_CMD_VERSION; pcfg->cfg_bits = cfg_bits; device_printf(dev, "%s: cfg_bits" " [STD_PAUSE_DIR, PAUSE_TYPE, DCBX]" " [0x%x, 0x%x, 0x%x]\n", __func__, ((cfg_bits & Q8_PORT_CFG_BITS_STDPAUSE_DIR_MASK)>>20), ((cfg_bits & Q8_PORT_CFG_BITS_PAUSE_CFG_MASK) >> 5), ((cfg_bits & Q8_PORT_CFG_BITS_DCBX_ENABLE) ? 1: 0)); ha->hw.imd_compl= 0; if (qla_mbx_cmd(ha, (uint32_t *)pcfg, (sizeof (q80_set_port_cfg_t) >> 2), ha->hw.mbox, (sizeof (q80_set_port_cfg_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } pfg_rsp = (q80_set_port_cfg_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(pfg_rsp->regcnt_status); if (err == Q8_MBX_RSP_IDC_INTRMD_RSP) { while (count && !ha->hw.imd_compl) { qla_mdelay(__func__, 100); count--; } if (count) { device_printf(dev, "%s: count %d\n", __func__, count); err = 0; } } if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); return(-1); } return (0); } static int qla_get_minidump_tmplt_size(qla_host_t *ha, uint32_t *size) { uint32_t err; device_t dev = ha->pci_dev; q80_config_md_templ_size_t *md_size; q80_config_md_templ_size_rsp_t *md_size_rsp; #ifndef QL_LDFLASH_FW ql_minidump_template_hdr_t *hdr; hdr = (ql_minidump_template_hdr_t *)ql83xx_minidump; *size = hdr->size_of_template; return (0); #endif /* #ifdef QL_LDFLASH_FW */ md_size = (q80_config_md_templ_size_t *) ha->hw.mbox; bzero(md_size, sizeof(q80_config_md_templ_size_t)); md_size->opcode = Q8_MBX_GET_MINIDUMP_TMPLT_SIZE; md_size->count_version = (sizeof (q80_config_md_templ_size_t) >> 2); md_size->count_version |= Q8_MBX_CMD_VERSION; if (qla_mbx_cmd(ha, (uint32_t *) md_size, (sizeof(q80_config_md_templ_size_t) >> 2), ha->hw.mbox, (sizeof(q80_config_md_templ_size_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return (-1); } md_size_rsp = (q80_config_md_templ_size_rsp_t *) ha->hw.mbox; err = Q8_MBX_RSP_STATUS(md_size_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); return(-1); } *size = md_size_rsp->templ_size; return (0); } static int qla_get_port_config(qla_host_t *ha, uint32_t *cfg_bits) { device_t dev; q80_get_port_cfg_t *pcfg; q80_get_port_cfg_rsp_t *pcfg_rsp; uint32_t err; dev = ha->pci_dev; pcfg = (q80_get_port_cfg_t *)ha->hw.mbox; bzero(pcfg, sizeof(q80_get_port_cfg_t)); pcfg->opcode = Q8_MBX_GET_PORT_CONFIG; pcfg->count_version = (sizeof (q80_get_port_cfg_t) >> 2); pcfg->count_version |= Q8_MBX_CMD_VERSION; if (qla_mbx_cmd(ha, (uint32_t *)pcfg, (sizeof (q80_get_port_cfg_t) >> 2), ha->hw.mbox, (sizeof (q80_get_port_cfg_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return -1; } pcfg_rsp = (q80_get_port_cfg_rsp_t *)ha->hw.mbox; err = Q8_MBX_RSP_STATUS(pcfg_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); return(-1); } device_printf(dev, "%s: [cfg_bits, port type]" " [0x%08x, 0x%02x] [STD_PAUSE_DIR, PAUSE_TYPE, DCBX]" " [0x%x, 0x%x, 0x%x]\n", __func__, pcfg_rsp->cfg_bits, pcfg_rsp->phys_port_type, ((pcfg_rsp->cfg_bits & Q8_PORT_CFG_BITS_STDPAUSE_DIR_MASK)>>20), ((pcfg_rsp->cfg_bits & Q8_PORT_CFG_BITS_PAUSE_CFG_MASK) >> 5), ((pcfg_rsp->cfg_bits & Q8_PORT_CFG_BITS_DCBX_ENABLE) ? 1: 0) ); *cfg_bits = pcfg_rsp->cfg_bits; return (0); } int qla_iscsi_pdu(qla_host_t *ha, struct mbuf *mp) { struct ether_vlan_header *eh; uint16_t etype; struct ip *ip = NULL; struct ip6_hdr *ip6 = NULL; struct tcphdr *th = NULL; uint32_t hdrlen; uint32_t offset; uint8_t buf[sizeof(struct ip6_hdr)]; eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { hdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = ntohs(eh->evl_proto); } else { hdrlen = ETHER_HDR_LEN; etype = ntohs(eh->evl_encap_proto); } if (etype == ETHERTYPE_IP) { offset = (hdrlen + sizeof (struct ip)); if (mp->m_len >= offset) { ip = (struct ip *)(mp->m_data + hdrlen); } else { m_copydata(mp, hdrlen, sizeof (struct ip), buf); ip = (struct ip *)buf; } if (ip->ip_p == IPPROTO_TCP) { hdrlen += ip->ip_hl << 2; offset = hdrlen + 4; if (mp->m_len >= offset) { th = (struct tcphdr *)(mp->m_data + hdrlen);; } else { m_copydata(mp, hdrlen, 4, buf); th = (struct tcphdr *)buf; } } } else if (etype == ETHERTYPE_IPV6) { offset = (hdrlen + sizeof (struct ip6_hdr)); if (mp->m_len >= offset) { ip6 = (struct ip6_hdr *)(mp->m_data + hdrlen); } else { m_copydata(mp, hdrlen, sizeof (struct ip6_hdr), buf); ip6 = (struct ip6_hdr *)buf; } if (ip6->ip6_nxt == IPPROTO_TCP) { hdrlen += sizeof(struct ip6_hdr); offset = hdrlen + 4; if (mp->m_len >= offset) { th = (struct tcphdr *)(mp->m_data + hdrlen);; } else { m_copydata(mp, hdrlen, 4, buf); th = (struct tcphdr *)buf; } } } if (th != NULL) { if ((th->th_sport == htons(3260)) || (th->th_dport == htons(3260))) return 0; } return (-1); } void qla_hw_async_event(qla_host_t *ha) { switch (ha->hw.aen_mb0) { case 0x8101: (void)qla_idc_ack(ha, ha->hw.aen_mb1, ha->hw.aen_mb2, ha->hw.aen_mb3, ha->hw.aen_mb4); break; default: break; } return; } #ifdef QL_LDFLASH_FW static int ql_get_minidump_template(qla_host_t *ha) { uint32_t err; device_t dev = ha->pci_dev; q80_config_md_templ_cmd_t *md_templ; q80_config_md_templ_cmd_rsp_t *md_templ_rsp; md_templ = (q80_config_md_templ_cmd_t *) ha->hw.mbox; bzero(md_templ, (sizeof (q80_config_md_templ_cmd_t))); md_templ->opcode = Q8_MBX_GET_MINIDUMP_TMPLT; md_templ->count_version = ( sizeof(q80_config_md_templ_cmd_t) >> 2); md_templ->count_version |= Q8_MBX_CMD_VERSION; md_templ->buf_addr = ha->hw.dma_buf.minidump.dma_addr; md_templ->buff_size = ha->hw.dma_buf.minidump.size; if (qla_mbx_cmd(ha, (uint32_t *) md_templ, (sizeof(q80_config_md_templ_cmd_t) >> 2), ha->hw.mbox, (sizeof(q80_config_md_templ_cmd_rsp_t) >> 2), 0)) { device_printf(dev, "%s: failed\n", __func__); return (-1); } md_templ_rsp = (q80_config_md_templ_cmd_rsp_t *) ha->hw.mbox; err = Q8_MBX_RSP_STATUS(md_templ_rsp->regcnt_status); if (err) { device_printf(dev, "%s: failed [0x%08x]\n", __func__, err); return (-1); } return (0); } #endif /* #ifdef QL_LDFLASH_FW */ /* * Minidump related functionality */ static int ql_parse_template(qla_host_t *ha); static uint32_t ql_rdcrb(qla_host_t *ha, ql_minidump_entry_rdcrb_t *crb_entry, uint32_t * data_buff); static uint32_t ql_pollrd(qla_host_t *ha, ql_minidump_entry_pollrd_t *entry, uint32_t * data_buff); static uint32_t ql_pollrd_modify_write(qla_host_t *ha, ql_minidump_entry_rd_modify_wr_with_poll_t *entry, uint32_t *data_buff); static uint32_t ql_L2Cache(qla_host_t *ha, ql_minidump_entry_cache_t *cacheEntry, uint32_t * data_buff); static uint32_t ql_L1Cache(qla_host_t *ha, ql_minidump_entry_cache_t *cacheEntry, uint32_t *data_buff); static uint32_t ql_rdocm(qla_host_t *ha, ql_minidump_entry_rdocm_t *ocmEntry, uint32_t *data_buff); static uint32_t ql_rdmem(qla_host_t *ha, ql_minidump_entry_rdmem_t *mem_entry, uint32_t *data_buff); static uint32_t ql_rdrom(qla_host_t *ha, ql_minidump_entry_rdrom_t *romEntry, uint32_t *data_buff); static uint32_t ql_rdmux(qla_host_t *ha, ql_minidump_entry_mux_t *muxEntry, uint32_t *data_buff); static uint32_t ql_rdmux2(qla_host_t *ha, ql_minidump_entry_mux2_t *muxEntry, uint32_t *data_buff); static uint32_t ql_rdqueue(qla_host_t *ha, ql_minidump_entry_queue_t *queueEntry, uint32_t *data_buff); static uint32_t ql_cntrl(qla_host_t *ha, ql_minidump_template_hdr_t *template_hdr, ql_minidump_entry_cntrl_t *crbEntry); static uint32_t ql_minidump_size(qla_host_t *ha) { uint32_t i, k; uint32_t size = 0; ql_minidump_template_hdr_t *hdr; hdr = (ql_minidump_template_hdr_t *)ha->hw.dma_buf.minidump.dma_b; i = 0x2; for (k = 1; k < QL_DBG_CAP_SIZE_ARRAY_LEN; k++) { if (i & ha->hw.mdump_capture_mask) size += hdr->capture_size_array[k]; i = i << 1; } return (size); } static void ql_free_minidump_buffer(qla_host_t *ha) { if (ha->hw.mdump_buffer != NULL) { free(ha->hw.mdump_buffer, M_QLA83XXBUF); ha->hw.mdump_buffer = NULL; ha->hw.mdump_buffer_size = 0; } return; } static int ql_alloc_minidump_buffer(qla_host_t *ha) { ha->hw.mdump_buffer_size = ql_minidump_size(ha); if (!ha->hw.mdump_buffer_size) return (-1); ha->hw.mdump_buffer = malloc(ha->hw.mdump_buffer_size, M_QLA83XXBUF, M_NOWAIT); if (ha->hw.mdump_buffer == NULL) return (-1); return (0); } static void ql_free_minidump_template_buffer(qla_host_t *ha) { if (ha->hw.mdump_template != NULL) { free(ha->hw.mdump_template, M_QLA83XXBUF); ha->hw.mdump_template = NULL; ha->hw.mdump_template_size = 0; } return; } static int ql_alloc_minidump_template_buffer(qla_host_t *ha) { ha->hw.mdump_template_size = ha->hw.dma_buf.minidump.size; ha->hw.mdump_template = malloc(ha->hw.mdump_template_size, M_QLA83XXBUF, M_NOWAIT); if (ha->hw.mdump_template == NULL) return (-1); return (0); } static int ql_alloc_minidump_buffers(qla_host_t *ha) { int ret; ret = ql_alloc_minidump_template_buffer(ha); if (ret) return (ret); ret = ql_alloc_minidump_buffer(ha); if (ret) ql_free_minidump_template_buffer(ha); return (ret); } static uint32_t ql_validate_minidump_checksum(qla_host_t *ha) { uint64_t sum = 0; int count; uint32_t *template_buff; count = ha->hw.dma_buf.minidump.size / sizeof (uint32_t); template_buff = ha->hw.dma_buf.minidump.dma_b; while (count-- > 0) { sum += *template_buff++; } while (sum >> 32) { sum = (sum & 0xFFFFFFFF) + (sum >> 32); } return (~sum); } int ql_minidump_init(qla_host_t *ha) { int ret = 0; uint32_t template_size = 0; device_t dev = ha->pci_dev; /* * Get Minidump Template Size */ ret = qla_get_minidump_tmplt_size(ha, &template_size); if (ret || (template_size == 0)) { device_printf(dev, "%s: failed [%d, %d]\n", __func__, ret, template_size); return (-1); } /* * Allocate Memory for Minidump Template */ ha->hw.dma_buf.minidump.alignment = 8; ha->hw.dma_buf.minidump.size = template_size; #ifdef QL_LDFLASH_FW if (ql_alloc_dmabuf(ha, &ha->hw.dma_buf.minidump)) { device_printf(dev, "%s: minidump dma alloc failed\n", __func__); return (-1); } ha->hw.dma_buf.flags.minidump = 1; /* * Retrieve Minidump Template */ ret = ql_get_minidump_template(ha); #else ha->hw.dma_buf.minidump.dma_b = ql83xx_minidump; #endif /* #ifdef QL_LDFLASH_FW */ if (ret == 0) { ret = ql_validate_minidump_checksum(ha); if (ret == 0) { ret = ql_alloc_minidump_buffers(ha); if (ret == 0) ha->hw.mdump_init = 1; else device_printf(dev, "%s: ql_alloc_minidump_buffers" " failed\n", __func__); } else { device_printf(dev, "%s: ql_validate_minidump_checksum" " failed\n", __func__); } } else { device_printf(dev, "%s: ql_get_minidump_template failed\n", __func__); } if (ret) ql_minidump_free(ha); return (ret); } static void ql_minidump_free(qla_host_t *ha) { ha->hw.mdump_init = 0; if (ha->hw.dma_buf.flags.minidump) { ha->hw.dma_buf.flags.minidump = 0; ql_free_dmabuf(ha, &ha->hw.dma_buf.minidump); } ql_free_minidump_template_buffer(ha); ql_free_minidump_buffer(ha); return; } void ql_minidump(qla_host_t *ha) { if (!ha->hw.mdump_init) return; if (ha->hw.mdump_done) return; ha->hw.mdump_start_seq_index = ql_stop_sequence(ha); bzero(ha->hw.mdump_buffer, ha->hw.mdump_buffer_size); bzero(ha->hw.mdump_template, ha->hw.mdump_template_size); bcopy(ha->hw.dma_buf.minidump.dma_b, ha->hw.mdump_template, ha->hw.mdump_template_size); ql_parse_template(ha); ql_start_sequence(ha, ha->hw.mdump_start_seq_index); ha->hw.mdump_done = 1; return; } /* * helper routines */ static void ql_entry_err_chk(ql_minidump_entry_t *entry, uint32_t esize) { if (esize != entry->hdr.entry_capture_size) { entry->hdr.entry_capture_size = esize; entry->hdr.driver_flags |= QL_DBG_SIZE_ERR_FLAG; } return; } static int ql_parse_template(qla_host_t *ha) { uint32_t num_of_entries, buff_level, e_cnt, esize; uint32_t end_cnt, rv = 0; char *dump_buff, *dbuff; int sane_start = 0, sane_end = 0; ql_minidump_template_hdr_t *template_hdr; ql_minidump_entry_t *entry; uint32_t capture_mask; uint32_t dump_size; /* Setup parameters */ template_hdr = (ql_minidump_template_hdr_t *)ha->hw.mdump_template; if (template_hdr->entry_type == TLHDR) sane_start = 1; dump_buff = (char *) ha->hw.mdump_buffer; num_of_entries = template_hdr->num_of_entries; entry = (ql_minidump_entry_t *) ((char *)template_hdr + template_hdr->first_entry_offset ); template_hdr->saved_state_array[QL_OCM0_ADDR_INDX] = template_hdr->ocm_window_array[ha->pci_func]; template_hdr->saved_state_array[QL_PCIE_FUNC_INDX] = ha->pci_func; capture_mask = ha->hw.mdump_capture_mask; dump_size = ha->hw.mdump_buffer_size; template_hdr->driver_capture_mask = capture_mask; QL_DPRINT80(ha, (ha->pci_dev, "%s: sane_start = %d num_of_entries = %d " "capture_mask = 0x%x dump_size = %d \n", __func__, sane_start, num_of_entries, capture_mask, dump_size)); for (buff_level = 0, e_cnt = 0; e_cnt < num_of_entries; e_cnt++) { /* * If the capture_mask of the entry does not match capture mask * skip the entry after marking the driver_flags indicator. */ if (!(entry->hdr.entry_capture_mask & capture_mask)) { entry->hdr.driver_flags |= QL_DBG_SKIPPED_FLAG; entry = (ql_minidump_entry_t *) ((char *) entry + entry->hdr.entry_size); continue; } /* * This is ONLY needed in implementations where * the capture buffer allocated is too small to capture * all of the required entries for a given capture mask. * We need to empty the buffer contents to a file * if possible, before processing the next entry * If the buff_full_flag is set, no further capture will happen * and all remaining non-control entries will be skipped. */ if (entry->hdr.entry_capture_size != 0) { if ((buff_level + entry->hdr.entry_capture_size) > dump_size) { /* Try to recover by emptying buffer to file */ entry->hdr.driver_flags |= QL_DBG_SKIPPED_FLAG; entry = (ql_minidump_entry_t *) ((char *) entry + entry->hdr.entry_size); continue; } } /* * Decode the entry type and process it accordingly */ switch (entry->hdr.entry_type) { case RDNOP: break; case RDEND: if (sane_end == 0) { end_cnt = e_cnt; } sane_end++; break; case RDCRB: dbuff = dump_buff + buff_level; esize = ql_rdcrb(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case POLLRD: dbuff = dump_buff + buff_level; esize = ql_pollrd(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case POLLRDMWR: dbuff = dump_buff + buff_level; esize = ql_pollrd_modify_write(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case L2ITG: case L2DTG: case L2DAT: case L2INS: dbuff = dump_buff + buff_level; esize = ql_L2Cache(ha, (void *)entry, (void *)dbuff); if (esize == -1) { entry->hdr.driver_flags |= QL_DBG_SKIPPED_FLAG; } else { ql_entry_err_chk(entry, esize); buff_level += esize; } break; case L1DAT: case L1INS: dbuff = dump_buff + buff_level; esize = ql_L1Cache(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case RDOCM: dbuff = dump_buff + buff_level; esize = ql_rdocm(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case RDMEM: dbuff = dump_buff + buff_level; esize = ql_rdmem(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case BOARD: case RDROM: dbuff = dump_buff + buff_level; esize = ql_rdrom(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case RDMUX: dbuff = dump_buff + buff_level; esize = ql_rdmux(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case RDMUX2: dbuff = dump_buff + buff_level; esize = ql_rdmux2(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case QUEUE: dbuff = dump_buff + buff_level; esize = ql_rdqueue(ha, (void *)entry, (void *)dbuff); ql_entry_err_chk(entry, esize); buff_level += esize; break; case CNTRL: if ((rv = ql_cntrl(ha, template_hdr, (void *)entry))) { entry->hdr.driver_flags |= QL_DBG_SKIPPED_FLAG; } break; default: entry->hdr.driver_flags |= QL_DBG_SKIPPED_FLAG; break; } /* next entry in the template */ entry = (ql_minidump_entry_t *) ((char *) entry + entry->hdr.entry_size); } if (!sane_start || (sane_end > 1)) { device_printf(ha->pci_dev, "\n%s: Template configuration error. Check Template\n", __func__); } QL_DPRINT80(ha, (ha->pci_dev, "%s: Minidump num of entries = %d\n", __func__, template_hdr->num_of_entries)); return 0; } /* * Read CRB operation. */ static uint32_t ql_rdcrb(qla_host_t *ha, ql_minidump_entry_rdcrb_t * crb_entry, uint32_t * data_buff) { int loop_cnt; int ret; uint32_t op_count, addr, stride, value = 0; addr = crb_entry->addr; op_count = crb_entry->op_count; stride = crb_entry->addr_stride; for (loop_cnt = 0; loop_cnt < op_count; loop_cnt++) { ret = ql_rdwr_indreg32(ha, addr, &value, 1); if (ret) return (0); *data_buff++ = addr; *data_buff++ = value; addr = addr + stride; } /* * for testing purpose we return amount of data written */ return (op_count * (2 * sizeof(uint32_t))); } /* * Handle L2 Cache. */ static uint32_t ql_L2Cache(qla_host_t *ha, ql_minidump_entry_cache_t *cacheEntry, uint32_t * data_buff) { int i, k; int loop_cnt; int ret; uint32_t read_value; uint32_t addr, read_addr, cntrl_addr, tag_reg_addr, cntl_value_w; uint32_t tag_value, read_cnt; volatile uint8_t cntl_value_r; long timeout; uint32_t data; loop_cnt = cacheEntry->op_count; read_addr = cacheEntry->read_addr; cntrl_addr = cacheEntry->control_addr; cntl_value_w = (uint32_t) cacheEntry->write_value; tag_reg_addr = cacheEntry->tag_reg_addr; tag_value = cacheEntry->init_tag_value; read_cnt = cacheEntry->read_addr_cnt; for (i = 0; i < loop_cnt; i++) { ret = ql_rdwr_indreg32(ha, tag_reg_addr, &tag_value, 0); if (ret) return (0); if (cacheEntry->write_value != 0) { ret = ql_rdwr_indreg32(ha, cntrl_addr, &cntl_value_w, 0); if (ret) return (0); } if (cacheEntry->poll_mask != 0) { timeout = cacheEntry->poll_wait; ret = ql_rdwr_indreg32(ha, cntrl_addr, &data, 1); if (ret) return (0); cntl_value_r = (uint8_t)data; while ((cntl_value_r & cacheEntry->poll_mask) != 0) { if (timeout) { qla_mdelay(__func__, 1); timeout--; } else break; ret = ql_rdwr_indreg32(ha, cntrl_addr, &data, 1); if (ret) return (0); cntl_value_r = (uint8_t)data; } if (!timeout) { /* Report timeout error. * core dump capture failed * Skip remaining entries. * Write buffer out to file * Use driver specific fields in template header * to report this error. */ return (-1); } } addr = read_addr; for (k = 0; k < read_cnt; k++) { ret = ql_rdwr_indreg32(ha, addr, &read_value, 1); if (ret) return (0); *data_buff++ = read_value; addr += cacheEntry->read_addr_stride; } tag_value += cacheEntry->tag_value_stride; } return (read_cnt * loop_cnt * sizeof(uint32_t)); } /* * Handle L1 Cache. */ static uint32_t ql_L1Cache(qla_host_t *ha, ql_minidump_entry_cache_t *cacheEntry, uint32_t *data_buff) { int ret; int i, k; int loop_cnt; uint32_t read_value; uint32_t addr, read_addr, cntrl_addr, tag_reg_addr; uint32_t tag_value, read_cnt; uint32_t cntl_value_w; loop_cnt = cacheEntry->op_count; read_addr = cacheEntry->read_addr; cntrl_addr = cacheEntry->control_addr; cntl_value_w = (uint32_t) cacheEntry->write_value; tag_reg_addr = cacheEntry->tag_reg_addr; tag_value = cacheEntry->init_tag_value; read_cnt = cacheEntry->read_addr_cnt; for (i = 0; i < loop_cnt; i++) { ret = ql_rdwr_indreg32(ha, tag_reg_addr, &tag_value, 0); if (ret) return (0); ret = ql_rdwr_indreg32(ha, cntrl_addr, &cntl_value_w, 0); if (ret) return (0); addr = read_addr; for (k = 0; k < read_cnt; k++) { ret = ql_rdwr_indreg32(ha, addr, &read_value, 1); if (ret) return (0); *data_buff++ = read_value; addr += cacheEntry->read_addr_stride; } tag_value += cacheEntry->tag_value_stride; } return (read_cnt * loop_cnt * sizeof(uint32_t)); } /* * Reading OCM memory */ static uint32_t ql_rdocm(qla_host_t *ha, ql_minidump_entry_rdocm_t *ocmEntry, uint32_t *data_buff) { int i, loop_cnt; volatile uint32_t addr; volatile uint32_t value; addr = ocmEntry->read_addr; loop_cnt = ocmEntry->op_count; for (i = 0; i < loop_cnt; i++) { value = READ_REG32(ha, addr); *data_buff++ = value; addr += ocmEntry->read_addr_stride; } return (loop_cnt * sizeof(value)); } /* * Read memory */ static uint32_t ql_rdmem(qla_host_t *ha, ql_minidump_entry_rdmem_t *mem_entry, uint32_t *data_buff) { int ret; int i, loop_cnt; volatile uint32_t addr; q80_offchip_mem_val_t val; addr = mem_entry->read_addr; /* size in bytes / 16 */ loop_cnt = mem_entry->read_data_size / (sizeof(uint32_t) * 4); for (i = 0; i < loop_cnt; i++) { ret = ql_rdwr_offchip_mem(ha, (addr & 0x0ffffffff), &val, 1); if (ret) return (0); *data_buff++ = val.data_lo; *data_buff++ = val.data_hi; *data_buff++ = val.data_ulo; *data_buff++ = val.data_uhi; addr += (sizeof(uint32_t) * 4); } return (loop_cnt * (sizeof(uint32_t) * 4)); } /* * Read Rom */ static uint32_t ql_rdrom(qla_host_t *ha, ql_minidump_entry_rdrom_t *romEntry, uint32_t *data_buff) { int ret; int i, loop_cnt; uint32_t addr; uint32_t value; addr = romEntry->read_addr; loop_cnt = romEntry->read_data_size; /* This is size in bytes */ loop_cnt /= sizeof(value); for (i = 0; i < loop_cnt; i++) { ret = ql_rd_flash32(ha, addr, &value); if (ret) return (0); *data_buff++ = value; addr += sizeof(value); } return (loop_cnt * sizeof(value)); } /* * Read MUX data */ static uint32_t ql_rdmux(qla_host_t *ha, ql_minidump_entry_mux_t *muxEntry, uint32_t *data_buff) { int ret; int loop_cnt; uint32_t read_value, sel_value; uint32_t read_addr, select_addr; select_addr = muxEntry->select_addr; sel_value = muxEntry->select_value; read_addr = muxEntry->read_addr; for (loop_cnt = 0; loop_cnt < muxEntry->op_count; loop_cnt++) { ret = ql_rdwr_indreg32(ha, select_addr, &sel_value, 0); if (ret) return (0); ret = ql_rdwr_indreg32(ha, read_addr, &read_value, 1); if (ret) return (0); *data_buff++ = sel_value; *data_buff++ = read_value; sel_value += muxEntry->select_value_stride; } return (loop_cnt * (2 * sizeof(uint32_t))); } static uint32_t ql_rdmux2(qla_host_t *ha, ql_minidump_entry_mux2_t *muxEntry, uint32_t *data_buff) { int ret; int loop_cnt; uint32_t select_addr_1, select_addr_2; uint32_t select_value_1, select_value_2; uint32_t select_value_count, select_value_mask; uint32_t read_addr, read_value; select_addr_1 = muxEntry->select_addr_1; select_addr_2 = muxEntry->select_addr_2; select_value_1 = muxEntry->select_value_1; select_value_2 = muxEntry->select_value_2; select_value_count = muxEntry->select_value_count; select_value_mask = muxEntry->select_value_mask; read_addr = muxEntry->read_addr; for (loop_cnt = 0; loop_cnt < muxEntry->select_value_count; loop_cnt++) { uint32_t temp_sel_val; ret = ql_rdwr_indreg32(ha, select_addr_1, &select_value_1, 0); if (ret) return (0); temp_sel_val = select_value_1 & select_value_mask; ret = ql_rdwr_indreg32(ha, select_addr_2, &temp_sel_val, 0); if (ret) return (0); ret = ql_rdwr_indreg32(ha, read_addr, &read_value, 1); if (ret) return (0); *data_buff++ = temp_sel_val; *data_buff++ = read_value; ret = ql_rdwr_indreg32(ha, select_addr_1, &select_value_2, 0); if (ret) return (0); temp_sel_val = select_value_2 & select_value_mask; ret = ql_rdwr_indreg32(ha, select_addr_2, &temp_sel_val, 0); if (ret) return (0); ret = ql_rdwr_indreg32(ha, read_addr, &read_value, 1); if (ret) return (0); *data_buff++ = temp_sel_val; *data_buff++ = read_value; select_value_1 += muxEntry->select_value_stride; select_value_2 += muxEntry->select_value_stride; } return (loop_cnt * (4 * sizeof(uint32_t))); } /* * Handling Queue State Reads. */ static uint32_t ql_rdqueue(qla_host_t *ha, ql_minidump_entry_queue_t *queueEntry, uint32_t *data_buff) { int ret; int loop_cnt, k; uint32_t read_value; uint32_t read_addr, read_stride, select_addr; uint32_t queue_id, read_cnt; read_cnt = queueEntry->read_addr_cnt; read_stride = queueEntry->read_addr_stride; select_addr = queueEntry->select_addr; for (loop_cnt = 0, queue_id = 0; loop_cnt < queueEntry->op_count; loop_cnt++) { ret = ql_rdwr_indreg32(ha, select_addr, &queue_id, 0); if (ret) return (0); read_addr = queueEntry->read_addr; for (k = 0; k < read_cnt; k++) { ret = ql_rdwr_indreg32(ha, read_addr, &read_value, 1); if (ret) return (0); *data_buff++ = read_value; read_addr += read_stride; } queue_id += queueEntry->queue_id_stride; } return (loop_cnt * (read_cnt * sizeof(uint32_t))); } /* * Handling control entries. */ static uint32_t ql_cntrl(qla_host_t *ha, ql_minidump_template_hdr_t *template_hdr, ql_minidump_entry_cntrl_t *crbEntry) { int ret; int count; uint32_t opcode, read_value, addr, entry_addr; long timeout; entry_addr = crbEntry->addr; for (count = 0; count < crbEntry->op_count; count++) { opcode = crbEntry->opcode; if (opcode & QL_DBG_OPCODE_WR) { ret = ql_rdwr_indreg32(ha, entry_addr, &crbEntry->value_1, 0); if (ret) return (0); opcode &= ~QL_DBG_OPCODE_WR; } if (opcode & QL_DBG_OPCODE_RW) { ret = ql_rdwr_indreg32(ha, entry_addr, &read_value, 1); if (ret) return (0); ret = ql_rdwr_indreg32(ha, entry_addr, &read_value, 0); if (ret) return (0); opcode &= ~QL_DBG_OPCODE_RW; } if (opcode & QL_DBG_OPCODE_AND) { ret = ql_rdwr_indreg32(ha, entry_addr, &read_value, 1); if (ret) return (0); read_value &= crbEntry->value_2; opcode &= ~QL_DBG_OPCODE_AND; if (opcode & QL_DBG_OPCODE_OR) { read_value |= crbEntry->value_3; opcode &= ~QL_DBG_OPCODE_OR; } ret = ql_rdwr_indreg32(ha, entry_addr, &read_value, 0); if (ret) return (0); } if (opcode & QL_DBG_OPCODE_OR) { ret = ql_rdwr_indreg32(ha, entry_addr, &read_value, 1); if (ret) return (0); read_value |= crbEntry->value_3; ret = ql_rdwr_indreg32(ha, entry_addr, &read_value, 0); if (ret) return (0); opcode &= ~QL_DBG_OPCODE_OR; } if (opcode & QL_DBG_OPCODE_POLL) { opcode &= ~QL_DBG_OPCODE_POLL; timeout = crbEntry->poll_timeout; addr = entry_addr; ret = ql_rdwr_indreg32(ha, addr, &read_value, 1); if (ret) return (0); while ((read_value & crbEntry->value_2) != crbEntry->value_1) { if (timeout) { qla_mdelay(__func__, 1); timeout--; } else break; ret = ql_rdwr_indreg32(ha, addr, &read_value, 1); if (ret) return (0); } if (!timeout) { /* * Report timeout error. * core dump capture failed * Skip remaining entries. * Write buffer out to file * Use driver specific fields in template header * to report this error. */ return (-1); } } if (opcode & QL_DBG_OPCODE_RDSTATE) { /* * decide which address to use. */ if (crbEntry->state_index_a) { addr = template_hdr->saved_state_array[ crbEntry-> state_index_a]; } else { addr = entry_addr; } ret = ql_rdwr_indreg32(ha, addr, &read_value, 1); if (ret) return (0); template_hdr->saved_state_array[crbEntry->state_index_v] = read_value; opcode &= ~QL_DBG_OPCODE_RDSTATE; } if (opcode & QL_DBG_OPCODE_WRSTATE) { /* * decide which value to use. */ if (crbEntry->state_index_v) { read_value = template_hdr->saved_state_array[ crbEntry->state_index_v]; } else { read_value = crbEntry->value_1; } /* * decide which address to use. */ if (crbEntry->state_index_a) { addr = template_hdr->saved_state_array[ crbEntry-> state_index_a]; } else { addr = entry_addr; } ret = ql_rdwr_indreg32(ha, addr, &read_value, 0); if (ret) return (0); opcode &= ~QL_DBG_OPCODE_WRSTATE; } if (opcode & QL_DBG_OPCODE_MDSTATE) { /* Read value from saved state using index */ read_value = template_hdr->saved_state_array[ crbEntry->state_index_v]; read_value <<= crbEntry->shl; /*Shift left operation */ read_value >>= crbEntry->shr; /*Shift right operation */ if (crbEntry->value_2) { /* check if AND mask is provided */ read_value &= crbEntry->value_2; } read_value |= crbEntry->value_3; /* OR operation */ read_value += crbEntry->value_1; /* increment op */ /* Write value back to state area. */ template_hdr->saved_state_array[crbEntry->state_index_v] = read_value; opcode &= ~QL_DBG_OPCODE_MDSTATE; } entry_addr += crbEntry->addr_stride; } return (0); } /* * Handling rd poll entry. */ static uint32_t ql_pollrd(qla_host_t *ha, ql_minidump_entry_pollrd_t *entry, uint32_t *data_buff) { int ret; int loop_cnt; uint32_t op_count, select_addr, select_value_stride, select_value; uint32_t read_addr, poll, mask, data_size, data; uint32_t wait_count = 0; select_addr = entry->select_addr; read_addr = entry->read_addr; select_value = entry->select_value; select_value_stride = entry->select_value_stride; op_count = entry->op_count; poll = entry->poll; mask = entry->mask; data_size = entry->data_size; for (loop_cnt = 0; loop_cnt < op_count; loop_cnt++) { ret = ql_rdwr_indreg32(ha, select_addr, &select_value, 0); if (ret) return (0); wait_count = 0; while (wait_count < poll) { uint32_t temp; ret = ql_rdwr_indreg32(ha, select_addr, &temp, 1); if (ret) return (0); if ( (temp & mask) != 0 ) { break; } wait_count++; } if (wait_count == poll) { device_printf(ha->pci_dev, "%s: Error in processing entry\n", __func__); device_printf(ha->pci_dev, "%s: wait_count <0x%x> poll <0x%x>\n", __func__, wait_count, poll); return 0; } ret = ql_rdwr_indreg32(ha, read_addr, &data, 1); if (ret) return (0); *data_buff++ = select_value; *data_buff++ = data; select_value = select_value + select_value_stride; } /* * for testing purpose we return amount of data written */ return (loop_cnt * (2 * sizeof(uint32_t))); } /* * Handling rd modify write poll entry. */ static uint32_t ql_pollrd_modify_write(qla_host_t *ha, ql_minidump_entry_rd_modify_wr_with_poll_t *entry, uint32_t *data_buff) { int ret; uint32_t addr_1, addr_2, value_1, value_2, data; uint32_t poll, mask, data_size, modify_mask; uint32_t wait_count = 0; addr_1 = entry->addr_1; addr_2 = entry->addr_2; value_1 = entry->value_1; value_2 = entry->value_2; poll = entry->poll; mask = entry->mask; modify_mask = entry->modify_mask; data_size = entry->data_size; ret = ql_rdwr_indreg32(ha, addr_1, &value_1, 0); if (ret) return (0); wait_count = 0; while (wait_count < poll) { uint32_t temp; ret = ql_rdwr_indreg32(ha, addr_1, &temp, 1); if (ret) return (0); if ( (temp & mask) != 0 ) { break; } wait_count++; } if (wait_count == poll) { device_printf(ha->pci_dev, "%s Error in processing entry\n", __func__); } else { ret = ql_rdwr_indreg32(ha, addr_2, &data, 1); if (ret) return (0); data = (data & modify_mask); ret = ql_rdwr_indreg32(ha, addr_2, &data, 0); if (ret) return (0); ret = ql_rdwr_indreg32(ha, addr_1, &value_2, 0); if (ret) return (0); /* Poll again */ wait_count = 0; while (wait_count < poll) { uint32_t temp; ret = ql_rdwr_indreg32(ha, addr_1, &temp, 1); if (ret) return (0); if ( (temp & mask) != 0 ) { break; } wait_count++; } *data_buff++ = addr_2; *data_buff++ = data; } /* * for testing purpose we return amount of data written */ return (2 * sizeof(uint32_t)); } Index: user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_hw.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_hw.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_hw.h (revision 306832) @@ -1,1750 +1,1752 @@ /* * Copyright (c) 2013-2016 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * File: ql_hw.h * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. */ #ifndef _QL_HW_H_ #define _QL_HW_H_ /* * PCIe Registers; Direct Mapped; Offsets from BAR0 */ /* * Register offsets for QLE8030 */ /* * Firmware Mailbox Registers * 0 thru 511; offsets 0x800 thru 0xFFC; 32bits each */ #define Q8_FW_MBOX0 0x00000800 #define Q8_FW_MBOX511 0x00000FFC /* * Host Mailbox Registers * 0 thru 511; offsets 0x000 thru 0x7FC; 32bits each */ #define Q8_HOST_MBOX0 0x00000000 #define Q8_HOST_MBOX511 0x000007FC #define Q8_MBOX_INT_ENABLE 0x00001000 #define Q8_MBOX_INT_MASK_MSIX 0x00001200 #define Q8_MBOX_INT_LEGACY 0x00003010 #define Q8_HOST_MBOX_CNTRL 0x00003038 #define Q8_FW_MBOX_CNTRL 0x0000303C #define Q8_PEG_HALT_STATUS1 0x000034A8 #define Q8_PEG_HALT_STATUS2 0x000034AC #define Q8_FIRMWARE_HEARTBEAT 0x000034B0 #define Q8_FLASH_LOCK_ID 0x00003500 #define Q8_DRIVER_LOCK_ID 0x00003504 #define Q8_FW_CAPABILITIES 0x00003528 #define Q8_FW_VER_MAJOR 0x00003550 #define Q8_FW_VER_MINOR 0x00003554 #define Q8_FW_VER_SUB 0x00003558 #define Q8_BOOTLD_ADDR 0x0000355C #define Q8_BOOTLD_SIZE 0x00003560 #define Q8_FW_IMAGE_ADDR 0x00003564 #define Q8_FW_BUILD_NUMBER 0x00003568 #define Q8_FW_IMAGE_VALID 0x000035FC #define Q8_CMDPEG_STATE 0x00003650 #define Q8_LINK_STATE 0x00003698 #define Q8_LINK_STATE_2 0x0000369C #define Q8_LINK_SPEED_0 0x000036E0 #define Q8_LINK_SPEED_1 0x000036E4 #define Q8_LINK_SPEED_2 0x000036E8 #define Q8_LINK_SPEED_3 0x000036EC #define Q8_MAX_LINK_SPEED_0 0x000036F0 #define Q8_MAX_LINK_SPEED_1 0x000036F4 #define Q8_MAX_LINK_SPEED_2 0x000036F8 #define Q8_MAX_LINK_SPEED_3 0x000036FC #define Q8_ASIC_TEMPERATURE 0x000037B4 /* * CRB Window Registers * 0 thru 15; offsets 0x3800 thru 0x383C; 32bits each */ #define Q8_CRB_WINDOW_PF0 0x00003800 #define Q8_CRB_WINDOW_PF15 0x0000383C #define Q8_FLASH_LOCK 0x00003850 #define Q8_FLASH_UNLOCK 0x00003854 #define Q8_DRIVER_LOCK 0x00003868 #define Q8_DRIVER_UNLOCK 0x0000386C #define Q8_LEGACY_INT_PTR 0x000038C0 #define Q8_LEGACY_INT_TRIG 0x000038C4 #define Q8_LEGACY_INT_MASK 0x000038C8 #define Q8_WILD_CARD 0x000038F0 #define Q8_INFORMANT 0x000038FC /* * Ethernet Interface Specific Registers */ #define Q8_DRIVER_OP_MODE 0x00003570 #define Q8_API_VERSION 0x0000356C #define Q8_NPAR_STATE 0x0000359C /* * End of PCIe Registers; Direct Mapped; Offsets from BAR0 */ /* * Indirect Registers */ #define Q8_LED_DUAL_0 0x28084C80 #define Q8_LED_SINGLE_0 0x28084C90 #define Q8_LED_DUAL_1 0x28084CA0 #define Q8_LED_SINGLE_1 0x28084CB0 #define Q8_LED_DUAL_2 0x28084CC0 #define Q8_LED_SINGLE_2 0x28084CD0 #define Q8_LED_DUAL_3 0x28084CE0 #define Q8_LED_SINGLE_3 0x28084CF0 #define Q8_GPIO_1 0x28084D00 #define Q8_GPIO_2 0x28084D10 #define Q8_GPIO_3 0x28084D20 #define Q8_GPIO_4 0x28084D40 #define Q8_GPIO_5 0x28084D50 #define Q8_GPIO_6 0x28084D60 #define Q8_GPIO_7 0x42100060 #define Q8_GPIO_8 0x42100064 #define Q8_FLASH_SPI_STATUS 0x2808E010 #define Q8_FLASH_SPI_CONTROL 0x2808E014 #define Q8_FLASH_STATUS 0x42100004 #define Q8_FLASH_CONTROL 0x42110004 #define Q8_FLASH_ADDRESS 0x42110008 #define Q8_FLASH_WR_DATA 0x4211000C #define Q8_FLASH_RD_DATA 0x42110018 #define Q8_FLASH_DIRECT_WINDOW 0x42110030 #define Q8_FLASH_DIRECT_DATA 0x42150000 #define Q8_MS_CNTRL 0x41000090 #define Q8_MS_ADDR_LO 0x41000094 #define Q8_MS_ADDR_HI 0x41000098 #define Q8_MS_WR_DATA_0_31 0x410000A0 #define Q8_MS_WR_DATA_32_63 0x410000A4 #define Q8_MS_WR_DATA_64_95 0x410000B0 #define Q8_MS_WR_DATA_96_127 0x410000B4 #define Q8_MS_RD_DATA_0_31 0x410000A8 #define Q8_MS_RD_DATA_32_63 0x410000AC #define Q8_MS_RD_DATA_64_95 0x410000B8 #define Q8_MS_RD_DATA_96_127 0x410000BC #define Q8_CRB_PEG_0 0x3400003c #define Q8_CRB_PEG_1 0x3410003c #define Q8_CRB_PEG_2 0x3420003c #define Q8_CRB_PEG_3 0x3430003c #define Q8_CRB_PEG_4 0x34B0003c /* * Macros for reading and writing registers */ #if defined(__i386__) || defined(__amd64__) #define Q8_MB() __asm volatile("mfence" ::: "memory") #define Q8_WMB() __asm volatile("sfence" ::: "memory") #define Q8_RMB() __asm volatile("lfence" ::: "memory") #else #define Q8_MB() #define Q8_WMB() #define Q8_RMB() #endif #define READ_REG32(ha, reg) bus_read_4((ha->pci_reg), reg) #define WRITE_REG32(ha, reg, val) \ {\ bus_write_4((ha->pci_reg), reg, val);\ bus_read_4((ha->pci_reg), reg);\ } #define Q8_NUM_MBOX 512 -#define Q8_MAX_NUM_MULTICAST_ADDRS 1023 +#define Q8_MAX_NUM_MULTICAST_ADDRS 1022 #define Q8_MAC_ADDR_LEN 6 /* * Firmware Interface */ /* * Command Response Interface - Commands */ #define Q8_MBX_CONFIG_IP_ADDRESS 0x0001 #define Q8_MBX_CONFIG_INTR 0x0002 #define Q8_MBX_MAP_INTR_SRC 0x0003 #define Q8_MBX_MAP_SDS_TO_RDS 0x0006 #define Q8_MBX_CREATE_RX_CNTXT 0x0007 #define Q8_MBX_DESTROY_RX_CNTXT 0x0008 #define Q8_MBX_CREATE_TX_CNTXT 0x0009 #define Q8_MBX_DESTROY_TX_CNTXT 0x000A #define Q8_MBX_ADD_RX_RINGS 0x000B #define Q8_MBX_CONFIG_LRO_FLOW 0x000C #define Q8_MBX_CONFIG_MAC_LEARNING 0x000D #define Q8_MBX_GET_STATS 0x000F #define Q8_MBX_GENERATE_INTR 0x0011 #define Q8_MBX_SET_MAX_MTU 0x0012 #define Q8_MBX_MAC_ADDR_CNTRL 0x001F #define Q8_MBX_GET_PCI_CONFIG 0x0020 #define Q8_MBX_GET_NIC_PARTITION 0x0021 #define Q8_MBX_SET_NIC_PARTITION 0x0022 #define Q8_MBX_QUERY_WOL_CAP 0x002C #define Q8_MBX_SET_WOL_CONFIG 0x002D #define Q8_MBX_GET_MINIDUMP_TMPLT_SIZE 0x002F #define Q8_MBX_GET_MINIDUMP_TMPLT 0x0030 #define Q8_MBX_GET_FW_DCBX_CAPS 0x0034 #define Q8_MBX_QUERY_DCBX_SETTINGS 0x0035 #define Q8_MBX_CONFIG_RSS 0x0041 #define Q8_MBX_CONFIG_RSS_TABLE 0x0042 #define Q8_MBX_CONFIG_INTR_COALESCE 0x0043 #define Q8_MBX_CONFIG_LED 0x0044 #define Q8_MBX_CONFIG_MAC_ADDR 0x0045 #define Q8_MBX_CONFIG_STATISTICS 0x0046 #define Q8_MBX_CONFIG_LOOPBACK 0x0047 #define Q8_MBX_LINK_EVENT_REQ 0x0048 #define Q8_MBX_CONFIG_MAC_RX_MODE 0x0049 #define Q8_MBX_CONFIG_FW_LRO 0x004A #define Q8_MBX_HW_CONFIG 0x004C #define Q8_MBX_INIT_NIC_FUNC 0x0060 #define Q8_MBX_STOP_NIC_FUNC 0x0061 #define Q8_MBX_IDC_REQ 0x0062 #define Q8_MBX_IDC_ACK 0x0063 #define Q8_MBX_SET_PORT_CONFIG 0x0066 #define Q8_MBX_GET_PORT_CONFIG 0x0067 #define Q8_MBX_GET_LINK_STATUS 0x0068 /* * Mailbox Command Response */ #define Q8_MBX_RSP_SUCCESS 0x0001 #define Q8_MBX_RSP_RESPONSE_FAILURE 0x0002 #define Q8_MBX_RSP_NO_CARD_CRB 0x0003 #define Q8_MBX_RSP_NO_CARD_MEM 0x0004 #define Q8_MBX_RSP_NO_CARD_RSRC 0x0005 #define Q8_MBX_RSP_INVALID_ARGS 0x0006 #define Q8_MBX_RSP_INVALID_ACTION 0x0007 #define Q8_MBX_RSP_INVALID_STATE 0x0008 #define Q8_MBX_RSP_NOT_SUPPORTED 0x0009 #define Q8_MBX_RSP_NOT_PERMITTED 0x000A #define Q8_MBX_RSP_NOT_READY 0x000B #define Q8_MBX_RSP_DOES_NOT_EXIST 0x000C #define Q8_MBX_RSP_ALREADY_EXISTS 0x000D #define Q8_MBX_RSP_BAD_SIGNATURE 0x000E #define Q8_MBX_RSP_CMD_NOT_IMPLEMENTED 0x000F #define Q8_MBX_RSP_CMD_INVALID 0x0010 #define Q8_MBX_RSP_TIMEOUT 0x0011 #define Q8_MBX_RSP_CMD_FAILED 0x0012 #define Q8_MBX_RSP_FATAL_TEMP 0x0013 #define Q8_MBX_RSP_MAX_EXCEEDED 0x0014 #define Q8_MBX_RSP_UNSPECIFIED 0x0015 #define Q8_MBX_RSP_INTR_CREATE_FAILED 0x0017 #define Q8_MBX_RSP_INTR_DELETE_FAILED 0x0018 #define Q8_MBX_RSP_INTR_INVALID_OP 0x0019 #define Q8_MBX_RSP_IDC_INTRMD_RSP 0x001A #define Q8_MBX_CMD_VERSION (0x2 << 13) #define Q8_MBX_RSP_STATUS(x) (((!(x >> 9)) || ((x >> 9) == 1)) ? 0: (x >> 9)) /* * Configure IP Address */ typedef struct _q80_config_ip_addr { uint16_t opcode; uint16_t count_version; uint8_t cmd; #define Q8_MBX_CONFIG_IP_ADD_IP 0x1 #define Q8_MBX_CONFIG_IP_DEL_IP 0x2 uint8_t ip_type; #define Q8_MBX_CONFIG_IP_V4 0x0 #define Q8_MBX_CONFIG_IP_V6 0x1 uint16_t rsrvd; union { struct { uint32_t addr; uint32_t rsrvd[3]; } ipv4; uint8_t ipv6_addr[16]; } u; } __packed q80_config_ip_addr_t; typedef struct _q80_config_ip_addr_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_config_ip_addr_rsp_t; /* * Configure Interrupt Command */ typedef struct _q80_intr { uint8_t cmd_type; #define Q8_MBX_CONFIG_INTR_CREATE 0x1 #define Q8_MBX_CONFIG_INTR_DELETE 0x2 #define Q8_MBX_CONFIG_INTR_TYPE_LINE (0x1 << 4) #define Q8_MBX_CONFIG_INTR_TYPE_MSI_X (0x3 << 4) uint8_t rsrvd; uint16_t msix_index; } __packed q80_intr_t; #define Q8_MAX_INTR_VECTORS 16 typedef struct _q80_config_intr { uint16_t opcode; uint16_t count_version; uint8_t nentries; uint8_t rsrvd[3]; q80_intr_t intr[Q8_MAX_INTR_VECTORS]; } __packed q80_config_intr_t; typedef struct _q80_intr_rsp { uint8_t status; uint8_t cmd; uint16_t intr_id; uint32_t intr_src; } q80_intr_rsp_t; typedef struct _q80_config_intr_rsp { uint16_t opcode; uint16_t regcnt_status; uint8_t nentries; uint8_t rsrvd[3]; q80_intr_rsp_t intr[Q8_MAX_INTR_VECTORS]; } __packed q80_config_intr_rsp_t; /* * Configure LRO Flow Command */ typedef struct _q80_config_lro_flow { uint16_t opcode; uint16_t count_version; uint8_t cmd; #define Q8_MBX_CONFIG_LRO_FLOW_ADD 0x01 #define Q8_MBX_CONFIG_LRO_FLOW_DELETE 0x02 uint8_t type_ts; #define Q8_MBX_CONFIG_LRO_FLOW_IPV4 0x00 #define Q8_MBX_CONFIG_LRO_FLOW_IPV6 0x01 #define Q8_MBX_CONFIG_LRO_FLOW_TS_ABSENT 0x00 #define Q8_MBX_CONFIG_LRO_FLOW_TS_PRESENT 0x02 uint16_t rsrvd; union { struct { uint32_t addr; uint32_t rsrvd[3]; } ipv4; uint8_t ipv6_addr[16]; } dst; union { struct { uint32_t addr; uint32_t rsrvd[3]; } ipv4; uint8_t ipv6_addr[16]; } src; uint16_t dst_port; uint16_t src_port; } __packed q80_config_lro_flow_t; typedef struct _q80_config_lro_flow_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_config_lro_flow_rsp_t; typedef struct _q80_set_max_mtu { uint16_t opcode; uint16_t count_version; uint32_t cntxt_id; uint32_t mtu; } __packed q80_set_max_mtu_t; typedef struct _q80_set_max_mtu_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_set_max_mtu_rsp_t; /* * Configure RSS */ typedef struct _q80_config_rss { uint16_t opcode; uint16_t count_version; uint16_t cntxt_id; uint16_t rsrvd; uint8_t hash_type; #define Q8_MBX_RSS_HASH_TYPE_IPV4_IP (0x1 << 4) #define Q8_MBX_RSS_HASH_TYPE_IPV4_TCP (0x2 << 4) #define Q8_MBX_RSS_HASH_TYPE_IPV4_TCP_IP (0x3 << 4) #define Q8_MBX_RSS_HASH_TYPE_IPV6_IP (0x1 << 6) #define Q8_MBX_RSS_HASH_TYPE_IPV6_TCP (0x2 << 6) #define Q8_MBX_RSS_HASH_TYPE_IPV6_TCP_IP (0x3 << 6) uint8_t flags; #define Q8_MBX_RSS_FLAGS_ENABLE_RSS (0x1) #define Q8_MBX_RSS_FLAGS_USE_IND_TABLE (0x2) #define Q8_MBX_RSS_FLAGS_TYPE_CRSS (0x4) uint16_t indtbl_mask; #define Q8_MBX_RSS_INDTBL_MASK 0x7F #define Q8_MBX_RSS_FLAGS_MULTI_RSS_VALID 0x8000 uint32_t multi_rss; #define Q8_MBX_RSS_MULTI_RSS_ENGINE_ASSIGN BIT_30 #define Q8_MBX_RSS_USE_MULTI_RSS_ENGINES BIT_31 uint64_t rss_key[5]; } __packed q80_config_rss_t; typedef struct _q80_config_rss_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_config_rss_rsp_t; /* * Configure RSS Indirection Table */ #define Q8_RSS_IND_TBL_SIZE 40 #define Q8_RSS_IND_TBL_MIN_IDX 0 #define Q8_RSS_IND_TBL_MAX_IDX 127 typedef struct _q80_config_rss_ind_table { uint16_t opcode; uint16_t count_version; uint8_t start_idx; uint8_t end_idx; uint16_t cntxt_id; uint8_t ind_table[Q8_RSS_IND_TBL_SIZE]; } __packed q80_config_rss_ind_table_t; typedef struct _q80_config_rss_ind_table_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_config_rss_ind_table_rsp_t; /* * Configure Interrupt Coalescing and Generation */ typedef struct _q80_config_intr_coalesc { uint16_t opcode; uint16_t count_version; uint16_t flags; #define Q8_MBX_INTRC_FLAGS_RCV 1 #define Q8_MBX_INTRC_FLAGS_XMT 2 #define Q8_MBX_INTRC_FLAGS_PERIODIC (1 << 3) uint16_t cntxt_id; uint16_t max_pkts; uint16_t max_mswait; uint8_t timer_type; #define Q8_MBX_INTRC_TIMER_NONE 0 #define Q8_MBX_INTRC_TIMER_SINGLE 1 #define Q8_MBX_INTRC_TIMER_PERIODIC 2 uint16_t sds_ring_mask; uint8_t rsrvd; uint32_t ms_timeout; } __packed q80_config_intr_coalesc_t; typedef struct _q80_config_intr_coalesc_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_config_intr_coalesc_rsp_t; /* * Configure MAC Address */ +#define Q8_ETHER_ADDR_LEN 6 typedef struct _q80_mac_addr { - uint8_t addr[6]; + uint8_t addr[Q8_ETHER_ADDR_LEN]; uint16_t vlan_tci; } __packed q80_mac_addr_t; #define Q8_MAX_MAC_ADDRS 64 typedef struct _q80_config_mac_addr { uint16_t opcode; uint16_t count_version; uint8_t cmd; #define Q8_MBX_CMAC_CMD_ADD_MAC_ADDR 1 #define Q8_MBX_CMAC_CMD_DEL_MAC_ADDR 2 #define Q8_MBX_CMAC_CMD_CAM_BOTH (0x0 << 6) #define Q8_MBX_CMAC_CMD_CAM_INGRESS (0x1 << 6) #define Q8_MBX_CMAC_CMD_CAM_EGRESS (0x2 << 6) uint8_t nmac_entries; uint16_t cntxt_id; q80_mac_addr_t mac_addr[Q8_MAX_MAC_ADDRS]; } __packed q80_config_mac_addr_t; typedef struct _q80_config_mac_addr_rsp { uint16_t opcode; uint16_t regcnt_status; uint8_t cmd; uint8_t nmac_entries; uint16_t cntxt_id; uint32_t status[Q8_MAX_MAC_ADDRS]; } __packed q80_config_mac_addr_rsp_t; /* * Configure MAC Receive Mode */ typedef struct _q80_config_mac_rcv_mode { uint16_t opcode; uint16_t count_version; uint8_t mode; #define Q8_MBX_MAC_RCV_PROMISC_ENABLE 0x1 #define Q8_MBX_MAC_ALL_MULTI_ENABLE 0x2 uint8_t rsrvd; uint16_t cntxt_id; } __packed q80_config_mac_rcv_mode_t; typedef struct _q80_config_mac_rcv_mode_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_config_mac_rcv_mode_rsp_t; /* * Configure Firmware Controlled LRO */ typedef struct _q80_config_fw_lro { uint16_t opcode; uint16_t count_version; uint8_t flags; #define Q8_MBX_FW_LRO_IPV4 0x1 #define Q8_MBX_FW_LRO_IPV6 0x2 #define Q8_MBX_FW_LRO_IPV4_WO_DST_IP_CHK 0x4 #define Q8_MBX_FW_LRO_IPV6_WO_DST_IP_CHK 0x8 #define Q8_MBX_FW_LRO_LOW_THRESHOLD 0x10 uint8_t rsrvd; uint16_t cntxt_id; uint16_t low_threshold; uint16_t rsrvd0; } __packed q80_config_fw_lro_t; typedef struct _q80_config_fw_lro_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_config_fw_lro_rsp_t; /* * Minidump mailbox commands */ typedef struct _q80_config_md_templ_size { uint16_t opcode; uint16_t count_version; } __packed q80_config_md_templ_size_t; typedef struct _q80_config_md_templ_size_rsp { uint16_t opcode; uint16_t regcnt_status; uint32_t rsrvd; uint32_t templ_size; uint32_t templ_version; } __packed q80_config_md_templ_size_rsp_t; typedef struct _q80_config_md_templ_cmd { uint16_t opcode; uint16_t count_version; uint64_t buf_addr; /* physical address of buffer */ uint32_t buff_size; uint32_t offset; } __packed q80_config_md_templ_cmd_t; typedef struct _q80_config_md_templ_cmd_rsp { uint16_t opcode; uint16_t regcnt_status; uint32_t rsrvd; uint32_t templ_size; uint32_t buff_size; uint32_t offset; } __packed q80_config_md_templ_cmd_rsp_t; /* * Hardware Configuration Commands */ typedef struct _q80_hw_config { uint16_t opcode; uint16_t count_version; #define Q8_HW_CONFIG_SET_MDIO_REG_COUNT 0x06 #define Q8_HW_CONFIG_GET_MDIO_REG_COUNT 0x05 #define Q8_HW_CONFIG_SET_CAM_SEARCH_MODE_COUNT 0x03 #define Q8_HW_CONFIG_GET_CAM_SEARCH_MODE_COUNT 0x02 #define Q8_HW_CONFIG_SET_TEMP_THRESHOLD_COUNT 0x03 #define Q8_HW_CONFIG_GET_TEMP_THRESHOLD_COUNT 0x02 #define Q8_HW_CONFIG_GET_ECC_COUNTS_COUNT 0x02 uint32_t cmd; #define Q8_HW_CONFIG_SET_MDIO_REG 0x01 #define Q8_HW_CONFIG_GET_MDIO_REG 0x02 #define Q8_HW_CONFIG_SET_CAM_SEARCH_MODE 0x03 #define Q8_HW_CONFIG_GET_CAM_SEARCH_MODE 0x04 #define Q8_HW_CONFIG_SET_TEMP_THRESHOLD 0x07 #define Q8_HW_CONFIG_GET_TEMP_THRESHOLD 0x08 #define Q8_HW_CONFIG_GET_ECC_COUNTS 0x0A union { struct { uint32_t phys_port_number; uint32_t phy_dev_addr; uint32_t reg_addr; uint32_t data; } set_mdio; struct { uint32_t phys_port_number; uint32_t phy_dev_addr; uint32_t reg_addr; } get_mdio; struct { uint32_t mode; #define Q8_HW_CONFIG_CAM_SEARCH_MODE_INTERNAL 0x1 #define Q8_HW_CONFIG_CAM_SEARCH_MODE_AUTO 0x2 } set_cam_search_mode; struct { uint32_t value; } set_temp_threshold; } u; } __packed q80_hw_config_t; typedef struct _q80_hw_config_rsp { uint16_t opcode; uint16_t regcnt_status; union { struct { uint32_t value; } get_mdio; struct { uint32_t mode; } get_cam_search_mode; struct { uint32_t temp_warn; uint32_t curr_temp; uint32_t osc_ring_rate; uint32_t core_voltage; } get_temp_threshold; struct { uint32_t ddr_ecc_error_count; uint32_t ocm_ecc_error_count; uint32_t l2_dcache_ecc_error_count; uint32_t l2_icache_ecc_error_count; uint32_t eport_ecc_error_count; } get_ecc_counts; } u; } __packed q80_hw_config_rsp_t; /* * Link Event Request Command */ typedef struct _q80_link_event { uint16_t opcode; uint16_t count_version; uint8_t cmd; #define Q8_LINK_EVENT_CMD_STOP_PERIODIC 0 #define Q8_LINK_EVENT_CMD_ENABLE_ASYNC 1 uint8_t flags; #define Q8_LINK_EVENT_FLAGS_SEND_RSP 1 uint16_t cntxt_id; } __packed q80_link_event_t; typedef struct _q80_link_event_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_link_event_rsp_t; /* * Get Statistics Command */ typedef struct _q80_rcv_stats { uint64_t total_bytes; uint64_t total_pkts; uint64_t lro_pkt_count; uint64_t sw_pkt_count; uint64_t ip_chksum_err; uint64_t pkts_wo_acntxts; uint64_t pkts_dropped_no_sds_card; uint64_t pkts_dropped_no_sds_host; uint64_t oversized_pkts; uint64_t pkts_dropped_no_rds; uint64_t unxpctd_mcast_pkts; uint64_t re1_fbq_error; uint64_t invalid_mac_addr; uint64_t rds_prime_trys; uint64_t rds_prime_success; uint64_t lro_flows_added; uint64_t lro_flows_deleted; uint64_t lro_flows_active; uint64_t pkts_droped_unknown; } __packed q80_rcv_stats_t; typedef struct _q80_xmt_stats { uint64_t total_bytes; uint64_t total_pkts; uint64_t errors; uint64_t pkts_dropped; uint64_t switch_pkts; uint64_t num_buffers; } __packed q80_xmt_stats_t; typedef struct _q80_mac_stats { uint64_t xmt_frames; uint64_t xmt_bytes; uint64_t xmt_mcast_pkts; uint64_t xmt_bcast_pkts; uint64_t xmt_pause_frames; uint64_t xmt_cntrl_pkts; uint64_t xmt_pkt_lt_64bytes; uint64_t xmt_pkt_lt_127bytes; uint64_t xmt_pkt_lt_255bytes; uint64_t xmt_pkt_lt_511bytes; uint64_t xmt_pkt_lt_1023bytes; uint64_t xmt_pkt_lt_1518bytes; uint64_t xmt_pkt_gt_1518bytes; uint64_t rsrvd0[3]; uint64_t rcv_frames; uint64_t rcv_bytes; uint64_t rcv_mcast_pkts; uint64_t rcv_bcast_pkts; uint64_t rcv_pause_frames; uint64_t rcv_cntrl_pkts; uint64_t rcv_pkt_lt_64bytes; uint64_t rcv_pkt_lt_127bytes; uint64_t rcv_pkt_lt_255bytes; uint64_t rcv_pkt_lt_511bytes; uint64_t rcv_pkt_lt_1023bytes; uint64_t rcv_pkt_lt_1518bytes; uint64_t rcv_pkt_gt_1518bytes; uint64_t rsrvd1[3]; uint64_t rcv_len_error; uint64_t rcv_len_small; uint64_t rcv_len_large; uint64_t rcv_jabber; uint64_t rcv_dropped; uint64_t fcs_error; uint64_t align_error; uint64_t eswitched_frames; uint64_t eswitched_bytes; uint64_t eswitched_mcast_frames; uint64_t eswitched_bcast_frames; uint64_t eswitched_ucast_frames; uint64_t eswitched_err_free_frames; uint64_t eswitched_err_free_bytes; } __packed q80_mac_stats_t; typedef struct _q80_get_stats { uint16_t opcode; uint16_t count_version; uint32_t cmd; #define Q8_GET_STATS_CMD_CLEAR 0x01 #define Q8_GET_STATS_CMD_RCV 0x00 #define Q8_GET_STATS_CMD_XMT 0x02 #define Q8_GET_STATS_CMD_TYPE_CNTXT 0x00 #define Q8_GET_STATS_CMD_TYPE_MAC 0x04 #define Q8_GET_STATS_CMD_TYPE_FUNC 0x08 #define Q8_GET_STATS_CMD_TYPE_VPORT 0x0C #define Q8_GET_STATS_CMD_TYPE_ALL (0x7 << 2) } __packed q80_get_stats_t; typedef struct _q80_get_stats_rsp { uint16_t opcode; uint16_t regcnt_status; uint32_t cmd; union { q80_rcv_stats_t rcv; q80_xmt_stats_t xmt; q80_mac_stats_t mac; } u; } __packed q80_get_stats_rsp_t; typedef struct _q80_get_mac_rcv_xmt_stats_rsp { uint16_t opcode; uint16_t regcnt_status; uint32_t cmd; q80_mac_stats_t mac; q80_rcv_stats_t rcv; q80_xmt_stats_t xmt; } __packed q80_get_mac_rcv_xmt_stats_rsp_t; /* * Init NIC Function * Used to Register DCBX Configuration Change AEN */ typedef struct _q80_init_nic_func { uint16_t opcode; uint16_t count_version; uint32_t options; #define Q8_INIT_NIC_REG_IDC_AEN 0x01 #define Q8_INIT_NIC_REG_DCBX_CHNG_AEN 0x02 #define Q8_INIT_NIC_REG_SFP_CHNG_AEN 0x04 } __packed q80_init_nic_func_t; typedef struct _q80_init_nic_func_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_init_nic_func_rsp_t; /* * Stop NIC Function * Used to DeRegister DCBX Configuration Change AEN */ typedef struct _q80_stop_nic_func { uint16_t opcode; uint16_t count_version; uint32_t options; #define Q8_STOP_NIC_DEREG_DCBX_CHNG_AEN 0x02 #define Q8_STOP_NIC_DEREG_SFP_CHNG_AEN 0x04 } __packed q80_stop_nic_func_t; typedef struct _q80_stop_nic_func_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_stop_nic_func_rsp_t; /* * Query Firmware DCBX Capabilities */ typedef struct _q80_query_fw_dcbx_caps { uint16_t opcode; uint16_t count_version; } __packed q80_query_fw_dcbx_caps_t; typedef struct _q80_query_fw_dcbx_caps_rsp { uint16_t opcode; uint16_t regcnt_status; uint32_t dcbx_caps; #define Q8_QUERY_FW_DCBX_CAPS_TSA 0x00000001 #define Q8_QUERY_FW_DCBX_CAPS_ETS 0x00000002 #define Q8_QUERY_FW_DCBX_CAPS_DCBX_CEE_1_01 0x00000004 #define Q8_QUERY_FW_DCBX_CAPS_DCBX_IEEE_1_0 0x00000008 #define Q8_QUERY_FW_DCBX_MAX_TC_MASK 0x00F00000 #define Q8_QUERY_FW_DCBX_MAX_ETS_TC_MASK 0x0F000000 #define Q8_QUERY_FW_DCBX_MAX_PFC_TC_MASK 0xF0000000 } __packed q80_query_fw_dcbx_caps_rsp_t; /* * IDC Ack Cmd */ typedef struct _q80_idc_ack { uint16_t opcode; uint16_t count_version; uint32_t aen_mb1; uint32_t aen_mb2; uint32_t aen_mb3; uint32_t aen_mb4; } __packed q80_idc_ack_t; typedef struct _q80_idc_ack_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_idc_ack_rsp_t; /* * Set Port Configuration command * Used to set Ethernet Standard Pause values */ typedef struct _q80_set_port_cfg { uint16_t opcode; uint16_t count_version; uint32_t cfg_bits; #define Q8_PORT_CFG_BITS_LOOPBACK_MODE_MASK (0x7 << 1) #define Q8_PORT_CFG_BITS_LOOPBACK_MODE_NONE (0x0 << 1) #define Q8_PORT_CFG_BITS_LOOPBACK_MODE_HSS (0x2 << 1) #define Q8_PORT_CFG_BITS_LOOPBACK_MODE_PHY (0x3 << 1) #define Q8_PORT_CFG_BITS_LOOPBACK_MODE_EXT (0x4 << 1) #define Q8_VALID_LOOPBACK_MODE(mode) \ (((mode) == Q8_PORT_CFG_BITS_LOOPBACK_MODE_NONE) || \ (((mode) >= Q8_PORT_CFG_BITS_LOOPBACK_MODE_HSS) && \ ((mode) <= Q8_PORT_CFG_BITS_LOOPBACK_MODE_EXT))) #define Q8_PORT_CFG_BITS_DCBX_ENABLE BIT_4 #define Q8_PORT_CFG_BITS_PAUSE_CFG_MASK (0x3 << 5) #define Q8_PORT_CFG_BITS_PAUSE_DISABLED (0x0 << 5) #define Q8_PORT_CFG_BITS_PAUSE_STD (0x1 << 5) #define Q8_PORT_CFG_BITS_PAUSE_PPM (0x2 << 5) #define Q8_PORT_CFG_BITS_LNKCAP_10MB BIT_8 #define Q8_PORT_CFG_BITS_LNKCAP_100MB BIT_9 #define Q8_PORT_CFG_BITS_LNKCAP_1GB BIT_10 #define Q8_PORT_CFG_BITS_LNKCAP_10GB BIT_11 #define Q8_PORT_CFG_BITS_AUTONEG BIT_15 #define Q8_PORT_CFG_BITS_XMT_DISABLE BIT_17 #define Q8_PORT_CFG_BITS_FEC_RQSTD BIT_18 #define Q8_PORT_CFG_BITS_EEE_RQSTD BIT_19 #define Q8_PORT_CFG_BITS_STDPAUSE_DIR_MASK (0x3 << 20) #define Q8_PORT_CFG_BITS_STDPAUSE_XMT_RCV (0x0 << 20) #define Q8_PORT_CFG_BITS_STDPAUSE_XMT (0x1 << 20) #define Q8_PORT_CFG_BITS_STDPAUSE_RCV (0x2 << 20) } __packed q80_set_port_cfg_t; typedef struct _q80_set_port_cfg_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_set_port_cfg_rsp_t; /* * Get Port Configuration Command */ typedef struct _q80_get_port_cfg { uint16_t opcode; uint16_t count_version; } __packed q80_get_port_cfg_t; typedef struct _q80_get_port_cfg_rsp { uint16_t opcode; uint16_t regcnt_status; uint32_t cfg_bits; /* same as in q80_set_port_cfg_t */ uint8_t phys_port_type; uint8_t rsvd[3]; } __packed q80_get_port_cfg_rsp_t; /* * Get Link Status Command * Used to get current PAUSE values for the port */ typedef struct _q80_get_link_status { uint16_t opcode; uint16_t count_version; } __packed q80_get_link_status_t; typedef struct _q80_get_link_status_rsp { uint16_t opcode; uint16_t regcnt_status; uint32_t cfg_bits; #define Q8_GET_LINK_STAT_CFG_BITS_LINK_UP BIT_0 #define Q8_GET_LINK_STAT_CFG_BITS_LINK_SPEED_MASK (0x7 << 3) #define Q8_GET_LINK_STAT_CFG_BITS_LINK_SPEED_UNKNOWN (0x0 << 3) #define Q8_GET_LINK_STAT_CFG_BITS_LINK_SPEED_10MB (0x1 << 3) #define Q8_GET_LINK_STAT_CFG_BITS_LINK_SPEED_100MB (0x2 << 3) #define Q8_GET_LINK_STAT_CFG_BITS_LINK_SPEED_1GB (0x3 << 3) #define Q8_GET_LINK_STAT_CFG_BITS_LINK_SPEED_10GB (0x4 << 3) #define Q8_GET_LINK_STAT_CFG_BITS_PAUSE_CFG_MASK (0x3 << 6) #define Q8_GET_LINK_STAT_CFG_BITS_PAUSE_CFG_DISABLE (0x0 << 6) #define Q8_GET_LINK_STAT_CFG_BITS_PAUSE_CFG_STD (0x1 << 6) #define Q8_GET_LINK_STAT_CFG_BITS_PAUSE_CFG_PPM (0x2 << 6) #define Q8_GET_LINK_STAT_CFG_BITS_LOOPBACK_MASK (0x7 << 8) #define Q8_GET_LINK_STAT_CFG_BITS_LOOPBACK_NONE (0x0 << 6) #define Q8_GET_LINK_STAT_CFG_BITS_LOOPBACK_HSS (0x2 << 6) #define Q8_GET_LINK_STAT_CFG_BITS_LOOPBACK_PHY (0x3 << 6) #define Q8_GET_LINK_STAT_CFG_BITS_FEC_ENABLED BIT_12 #define Q8_GET_LINK_STAT_CFG_BITS_EEE_ENABLED BIT_13 #define Q8_GET_LINK_STAT_CFG_BITS_STDPAUSE_DIR_MASK (0x3 << 20) #define Q8_GET_LINK_STAT_CFG_BITS_STDPAUSE_NONE (0x0 << 20) #define Q8_GET_LINK_STAT_CFG_BITS_STDPAUSE_XMT (0x1 << 20) #define Q8_GET_LINK_STAT_CFG_BITS_STDPAUSE_RCV (0x2 << 20) #define Q8_GET_LINK_STAT_CFG_BITS_STDPAUSE_XMT_RCV (0x3 << 20) uint32_t link_state; #define Q8_GET_LINK_STAT_LOSS_OF_SIGNAL BIT_0 #define Q8_GET_LINK_STAT_PORT_RST_DONE BIT_3 #define Q8_GET_LINK_STAT_PHY_LINK_DOWN BIT_4 #define Q8_GET_LINK_STAT_PCS_LINK_DOWN BIT_5 #define Q8_GET_LINK_STAT_MAC_LOCAL_FAULT BIT_6 #define Q8_GET_LINK_STAT_MAC_REMOTE_FAULT BIT_7 #define Q8_GET_LINK_STAT_XMT_DISABLED BIT_9 #define Q8_GET_LINK_STAT_SFP_XMT_FAULT BIT_10 uint32_t sfp_info; #define Q8_GET_LINK_STAT_SFP_TRNCVR_MASK 0x3 #define Q8_GET_LINK_STAT_SFP_TRNCVR_NOT_EXPECTED 0x0 #define Q8_GET_LINK_STAT_SFP_TRNCVR_NONE 0x1 #define Q8_GET_LINK_STAT_SFP_TRNCVR_INVALID 0x2 #define Q8_GET_LINK_STAT_SFP_TRNCVR_VALID 0x3 #define Q8_GET_LINK_STAT_SFP_ADDTL_INFO_MASK (0x3 << 2) #define Q8_GET_LINK_STAT_SFP_ADDTL_INFO_UNREC_TRSVR (0x0 << 2) #define Q8_GET_LINK_STAT_SFP_ADDTL_INFO_NOT_QLOGIC (0x1 << 2) #define Q8_GET_LINK_STAT_SFP_ADDTL_INFO_SPEED_FAILED (0x2 << 2) #define Q8_GET_LINK_STAT_SFP_ADDTL_INFO_ACCESS_ERROR (0x3 << 2) #define Q8_GET_LINK_STAT_SFP_MOD_TYPE_MASK (0x1F << 4) #define Q8_GET_LINK_STAT_SFP_MOD_NONE (0x00 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_10GBLRM (0x01 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_10GBLR (0x02 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_10GBSR (0x03 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_10GBC_P (0x04 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_10GBC_AL (0x05 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_10GBC_PL (0x06 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_1GBSX (0x07 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_1GBLX (0x08 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_1GBCX (0x09 << 4) #define Q8_GET_LINK_STAT_SFP_MOD_1GBT (0x0A << 4) #define Q8_GET_LINK_STAT_SFP_MOD_1GBC_PL (0x0B << 4) #define Q8_GET_LINK_STAT_SFP_MOD_UNKNOWN (0x0F << 4) #define Q8_GET_LINK_STAT_SFP_MULTI_RATE_MOD BIT_9 #define Q8_GET_LINK_STAT_SFP_XMT_FAULT BIT_10 #define Q8_GET_LINK_STAT_SFP_COPPER_CBL_LENGTH_MASK (0xFF << 16) } __packed q80_get_link_status_rsp_t; /* * Transmit Related Definitions */ /* Max# of TX Rings per Tx Create Cntxt Mbx Cmd*/ #define MAX_TCNTXT_RINGS 8 /* * Transmit Context - Q8_CMD_CREATE_TX_CNTXT Command Configuration Data */ typedef struct _q80_rq_tx_ring { uint64_t paddr; uint64_t tx_consumer; uint16_t nentries; uint16_t intr_id; uint8_t intr_src_bit; uint8_t rsrvd[3]; } __packed q80_rq_tx_ring_t; typedef struct _q80_rq_tx_cntxt { uint16_t opcode; uint16_t count_version; uint32_t cap0; #define Q8_TX_CNTXT_CAP0_BASEFW (1 << 0) #define Q8_TX_CNTXT_CAP0_LSO (1 << 6) #define Q8_TX_CNTXT_CAP0_TC (1 << 25) uint32_t cap1; uint32_t cap2; uint32_t cap3; uint8_t ntx_rings; uint8_t traffic_class; /* bits 8-10; others reserved */ uint16_t tx_vpid; q80_rq_tx_ring_t tx_ring[MAX_TCNTXT_RINGS]; } __packed q80_rq_tx_cntxt_t; typedef struct _q80_rsp_tx_ring { uint32_t prod_index; uint16_t cntxt_id; uint8_t state; uint8_t rsrvd; } q80_rsp_tx_ring_t; typedef struct _q80_rsp_tx_cntxt { uint16_t opcode; uint16_t regcnt_status; uint8_t ntx_rings; uint8_t phy_port; uint8_t virt_port; uint8_t rsrvd; q80_rsp_tx_ring_t tx_ring[MAX_TCNTXT_RINGS]; } __packed q80_rsp_tx_cntxt_t; typedef struct _q80_tx_cntxt_destroy { uint16_t opcode; uint16_t count_version; uint32_t cntxt_id; } __packed q80_tx_cntxt_destroy_t; typedef struct _q80_tx_cntxt_destroy_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_tx_cntxt_destroy_rsp_t; /* * Transmit Command Descriptor * These commands are issued on the Transmit Ring associated with a Transmit * context */ typedef struct _q80_tx_cmd { uint8_t tcp_hdr_off; /* TCP Header Offset */ uint8_t ip_hdr_off; /* IP Header Offset */ uint16_t flags_opcode; /* Bits 0-6: flags; 7-12: opcode */ /* flags field */ #define Q8_TX_CMD_FLAGS_MULTICAST 0x01 #define Q8_TX_CMD_FLAGS_LSO_TSO 0x02 #define Q8_TX_CMD_FLAGS_VLAN_TAGGED 0x10 #define Q8_TX_CMD_FLAGS_HW_VLAN_ID 0x40 /* opcode field */ #define Q8_TX_CMD_OP_XMT_UDP_CHKSUM_IPV6 (0xC << 7) #define Q8_TX_CMD_OP_XMT_TCP_CHKSUM_IPV6 (0xB << 7) #define Q8_TX_CMD_OP_XMT_TCP_LSO_IPV6 (0x6 << 7) #define Q8_TX_CMD_OP_XMT_TCP_LSO (0x5 << 7) #define Q8_TX_CMD_OP_XMT_UDP_CHKSUM (0x3 << 7) #define Q8_TX_CMD_OP_XMT_TCP_CHKSUM (0x2 << 7) #define Q8_TX_CMD_OP_XMT_ETHER (0x1 << 7) uint8_t n_bufs; /* # of data segs in data buffer */ uint8_t data_len_lo; /* data length lower 8 bits */ uint16_t data_len_hi; /* data length upper 16 bits */ uint64_t buf2_addr; /* buffer 2 address */ uint16_t rsrvd0; uint16_t mss; /* MSS for this packet */ uint8_t cntxtid; /* Bits 7-4: ContextId; 3-0: reserved */ #define Q8_TX_CMD_PORT_CNXTID(c_id) ((c_id & 0xF) << 4) uint8_t total_hdr_len; /* MAC+IP+TCP Header Length for LSO */ uint16_t rsrvd1; uint64_t buf3_addr; /* buffer 3 address */ uint64_t buf1_addr; /* buffer 1 address */ uint16_t buf1_len; /* length of buffer 1 */ uint16_t buf2_len; /* length of buffer 2 */ uint16_t buf3_len; /* length of buffer 3 */ uint16_t buf4_len; /* length of buffer 4 */ uint64_t buf4_addr; /* buffer 4 address */ uint32_t rsrvd2; uint16_t rsrvd3; uint16_t vlan_tci; /* VLAN TCI when hw tagging is enabled*/ } __packed q80_tx_cmd_t; /* 64 bytes */ #define Q8_TX_CMD_MAX_SEGMENTS 4 #define Q8_TX_CMD_TSO_ALIGN 2 #define Q8_TX_MAX_NON_TSO_SEGS 62 /* * Receive Related Definitions */ #define MAX_RDS_RING_SETS 8 /* Max# of Receive Descriptor Rings */ #ifdef QL_ENABLE_ISCSI_TLV #define MAX_SDS_RINGS 32 /* Max# of Status Descriptor Rings */ #define NUM_TX_RINGS (MAX_SDS_RINGS * 2) #else #define MAX_SDS_RINGS 4 /* Max# of Status Descriptor Rings */ #define NUM_TX_RINGS MAX_SDS_RINGS #endif /* #ifdef QL_ENABLE_ISCSI_TLV */ #define MAX_RDS_RINGS MAX_SDS_RINGS /* Max# of Rcv Descriptor Rings */ typedef struct _q80_rq_sds_ring { uint64_t paddr; /* physical addr of status ring in system memory */ uint64_t hdr_split1; uint64_t hdr_split2; uint16_t size; /* number of entries in status ring */ uint16_t hdr_split1_size; uint16_t hdr_split2_size; uint16_t hdr_split_count; uint16_t intr_id; uint8_t intr_src_bit; uint8_t rsrvd[5]; } __packed q80_rq_sds_ring_t; /* 10 32bit words */ typedef struct _q80_rq_rds_ring { uint64_t paddr_std; /* physical addr of rcv ring in system memory */ uint64_t paddr_jumbo; /* physical addr of rcv ring in system memory */ uint16_t std_bsize; uint16_t std_nentries; uint16_t jumbo_bsize; uint16_t jumbo_nentries; } __packed q80_rq_rds_ring_t; /* 6 32bit words */ #define MAX_RCNTXT_SDS_RINGS 8 typedef struct _q80_rq_rcv_cntxt { uint16_t opcode; uint16_t count_version; uint32_t cap0; #define Q8_RCV_CNTXT_CAP0_BASEFW (1 << 0) #define Q8_RCV_CNTXT_CAP0_MULTI_RDS (1 << 1) #define Q8_RCV_CNTXT_CAP0_LRO (1 << 5) #define Q8_RCV_CNTXT_CAP0_HW_LRO (1 << 10) #define Q8_RCV_CNTXT_CAP0_VLAN_ALIGN (1 << 14) #define Q8_RCV_CNTXT_CAP0_RSS (1 << 15) #define Q8_RCV_CNTXT_CAP0_MSFT_RSS (1 << 16) #define Q8_RCV_CNTXT_CAP0_SGL_JUMBO (1 << 18) #define Q8_RCV_CNTXT_CAP0_SGL_LRO (1 << 19) #define Q8_RCV_CNTXT_CAP0_SINGLE_JUMBO (1 << 26) uint32_t cap1; uint32_t cap2; uint32_t cap3; uint8_t nrds_sets_rings; uint8_t nsds_rings; uint16_t rds_producer_mode; #define Q8_RCV_CNTXT_RDS_PROD_MODE_UNIQUE 0 #define Q8_RCV_CNTXT_RDS_PROD_MODE_SHARED 1 uint16_t rcv_vpid; uint16_t rsrvd0; uint32_t rsrvd1; q80_rq_sds_ring_t sds[MAX_RCNTXT_SDS_RINGS]; q80_rq_rds_ring_t rds[MAX_RDS_RING_SETS]; } __packed q80_rq_rcv_cntxt_t; typedef struct _q80_rsp_rds_ring { uint32_t prod_std; uint32_t prod_jumbo; } __packed q80_rsp_rds_ring_t; /* 8 bytes */ typedef struct _q80_rsp_rcv_cntxt { uint16_t opcode; uint16_t regcnt_status; uint8_t nrds_sets_rings; uint8_t nsds_rings; uint16_t cntxt_id; uint8_t state; uint8_t num_funcs; uint8_t phy_port; uint8_t virt_port; uint32_t sds_cons[MAX_RCNTXT_SDS_RINGS]; q80_rsp_rds_ring_t rds[MAX_RDS_RING_SETS]; } __packed q80_rsp_rcv_cntxt_t; typedef struct _q80_rcv_cntxt_destroy { uint16_t opcode; uint16_t count_version; uint32_t cntxt_id; } __packed q80_rcv_cntxt_destroy_t; typedef struct _q80_rcv_cntxt_destroy_rsp { uint16_t opcode; uint16_t regcnt_status; } __packed q80_rcv_cntxt_destroy_rsp_t; /* * Add Receive Rings */ typedef struct _q80_rq_add_rcv_rings { uint16_t opcode; uint16_t count_version; uint8_t nrds_sets_rings; uint8_t nsds_rings; uint16_t cntxt_id; q80_rq_sds_ring_t sds[MAX_RCNTXT_SDS_RINGS]; q80_rq_rds_ring_t rds[MAX_RDS_RING_SETS]; } __packed q80_rq_add_rcv_rings_t; typedef struct _q80_rsp_add_rcv_rings { uint16_t opcode; uint16_t regcnt_status; uint8_t nrds_sets_rings; uint8_t nsds_rings; uint16_t cntxt_id; uint32_t sds_cons[MAX_RCNTXT_SDS_RINGS]; q80_rsp_rds_ring_t rds[MAX_RDS_RING_SETS]; } __packed q80_rsp_add_rcv_rings_t; /* * Map Status Ring to Receive Descriptor Set */ #define MAX_SDS_TO_RDS_MAP 16 typedef struct _q80_sds_rds_map_e { uint8_t sds_ring; uint8_t rsrvd0; uint8_t rds_ring; uint8_t rsrvd1; } __packed q80_sds_rds_map_e_t; typedef struct _q80_rq_map_sds_to_rds { uint16_t opcode; uint16_t count_version; uint16_t cntxt_id; uint16_t num_rings; q80_sds_rds_map_e_t sds_rds[MAX_SDS_TO_RDS_MAP]; } __packed q80_rq_map_sds_to_rds_t; typedef struct _q80_rsp_map_sds_to_rds { uint16_t opcode; uint16_t regcnt_status; uint16_t cntxt_id; uint16_t num_rings; q80_sds_rds_map_e_t sds_rds[MAX_SDS_TO_RDS_MAP]; } __packed q80_rsp_map_sds_to_rds_t; /* * Receive Descriptor corresponding to each entry in the receive ring */ typedef struct _q80_rcv_desc { uint16_t handle; uint16_t rsrvd; uint32_t buf_size; /* buffer size in bytes */ uint64_t buf_addr; /* physical address of buffer */ } __packed q80_recv_desc_t; /* * Status Descriptor corresponding to each entry in the Status ring */ typedef struct _q80_stat_desc { uint64_t data[2]; } __packed q80_stat_desc_t; /* * definitions for data[0] field of Status Descriptor */ #define Q8_STAT_DESC_RSS_HASH(data) (data & 0xFFFFFFFF) #define Q8_STAT_DESC_TOTAL_LENGTH(data) ((data >> 32) & 0x3FFF) #define Q8_STAT_DESC_TOTAL_LENGTH_SGL_RCV(data) ((data >> 32) & 0xFFFF) #define Q8_STAT_DESC_HANDLE(data) ((data >> 48) & 0xFFFF) /* * definitions for data[1] field of Status Descriptor */ #define Q8_STAT_DESC_OPCODE(data) ((data >> 42) & 0xF) #define Q8_STAT_DESC_OPCODE_RCV_PKT 0x01 #define Q8_STAT_DESC_OPCODE_LRO_PKT 0x02 #define Q8_STAT_DESC_OPCODE_SGL_LRO 0x04 #define Q8_STAT_DESC_OPCODE_SGL_RCV 0x05 #define Q8_STAT_DESC_OPCODE_CONT 0x06 /* * definitions for data[1] field of Status Descriptor for standard frames * status descriptor opcode equals 0x04 */ #define Q8_STAT_DESC_STATUS(data) ((data >> 39) & 0x0007) #define Q8_STAT_DESC_STATUS_CHKSUM_NOT_DONE 0x00 #define Q8_STAT_DESC_STATUS_NO_CHKSUM 0x01 #define Q8_STAT_DESC_STATUS_CHKSUM_OK 0x02 #define Q8_STAT_DESC_STATUS_CHKSUM_ERR 0x03 #define Q8_STAT_DESC_VLAN(data) ((data >> 47) & 1) #define Q8_STAT_DESC_VLAN_ID(data) ((data >> 48) & 0xFFFF) #define Q8_STAT_DESC_PROTOCOL(data) ((data >> 44) & 0x000F) #define Q8_STAT_DESC_L2_OFFSET(data) ((data >> 48) & 0x001F) #define Q8_STAT_DESC_COUNT(data) ((data >> 37) & 0x0007) /* * definitions for data[0-1] fields of Status Descriptor for LRO * status descriptor opcode equals 0x04 */ /* definitions for data[1] field */ #define Q8_LRO_STAT_DESC_SEQ_NUM(data) (uint32_t)(data) /* * definitions specific to opcode 0x04 data[1] */ #define Q8_STAT_DESC_COUNT_SGL_LRO(data) ((data >> 13) & 0x0007) #define Q8_SGL_LRO_STAT_L2_OFFSET(data) ((data >> 16) & 0xFF) #define Q8_SGL_LRO_STAT_L4_OFFSET(data) ((data >> 24) & 0xFF) #define Q8_SGL_LRO_STAT_TS(data) ((data >> 40) & 0x1) #define Q8_SGL_LRO_STAT_PUSH_BIT(data) ((data >> 41) & 0x1) /* * definitions specific to opcode 0x05 data[1] */ #define Q8_STAT_DESC_COUNT_SGL_RCV(data) ((data >> 37) & 0x0003) /* * definitions for opcode 0x06 */ /* definitions for data[0] field */ #define Q8_SGL_STAT_DESC_HANDLE1(data) (data & 0xFFFF) #define Q8_SGL_STAT_DESC_HANDLE2(data) ((data >> 16) & 0xFFFF) #define Q8_SGL_STAT_DESC_HANDLE3(data) ((data >> 32) & 0xFFFF) #define Q8_SGL_STAT_DESC_HANDLE4(data) ((data >> 48) & 0xFFFF) /* definitions for data[1] field */ #define Q8_SGL_STAT_DESC_HANDLE5(data) (data & 0xFFFF) #define Q8_SGL_STAT_DESC_HANDLE6(data) ((data >> 16) & 0xFFFF) #define Q8_SGL_STAT_DESC_NUM_HANDLES(data) ((data >> 32) & 0x7) #define Q8_SGL_STAT_DESC_HANDLE7(data) ((data >> 48) & 0xFFFF) /** Driver Related Definitions Begin **/ #define TX_SMALL_PKT_SIZE 128 /* size in bytes of small packets */ /* The number of descriptors should be a power of 2 */ #define NUM_TX_DESCRIPTORS 1024 #define NUM_STATUS_DESCRIPTORS 1024 #define NUM_RX_DESCRIPTORS 2048 /* * structure describing various dma buffers */ typedef struct qla_dmabuf { volatile struct { uint32_t tx_ring :1, rds_ring :1, sds_ring :1, minidump :1; } flags; qla_dma_t tx_ring; qla_dma_t rds_ring[MAX_RDS_RINGS]; qla_dma_t sds_ring[MAX_SDS_RINGS]; qla_dma_t minidump; } qla_dmabuf_t; typedef struct _qla_sds { q80_stat_desc_t *sds_ring_base; /* start of sds ring */ uint32_t sdsr_next; /* next entry in SDS ring to process */ struct lro_ctrl lro; void *rxb_free; uint32_t rx_free; volatile uint32_t rcv_active; uint32_t sds_consumer; uint64_t intr_count; uint64_t spurious_intr_count; } qla_sds_t; #define Q8_MAX_LRO_CONT_DESC 7 #define Q8_MAX_HANDLES_LRO (1 + (Q8_MAX_LRO_CONT_DESC * 7)) #define Q8_MAX_HANDLES_NON_LRO 8 typedef struct _qla_sgl_rcv { uint16_t pkt_length; uint16_t num_handles; uint16_t chksum_status; uint32_t rss_hash; uint16_t rss_hash_flags; uint16_t vlan_tag; uint16_t handle[Q8_MAX_HANDLES_NON_LRO]; } qla_sgl_rcv_t; typedef struct _qla_sgl_lro { uint16_t flags; #define Q8_LRO_COMP_TS 0x1 #define Q8_LRO_COMP_PUSH_BIT 0x2 uint16_t l2_offset; uint16_t l4_offset; uint16_t payload_length; uint16_t num_handles; uint32_t rss_hash; uint16_t rss_hash_flags; uint16_t vlan_tag; uint16_t handle[Q8_MAX_HANDLES_LRO]; } qla_sgl_lro_t; typedef union { qla_sgl_rcv_t rcv; qla_sgl_lro_t lro; } qla_sgl_comp_t; #define QL_FRAME_HDR_SIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN +\ sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + 16) typedef struct _qla_hw_tx_cntxt { q80_tx_cmd_t *tx_ring_base; bus_addr_t tx_ring_paddr; volatile uint32_t *tx_cons; /* tx consumer shadow reg */ bus_addr_t tx_cons_paddr; volatile uint32_t txr_free; /* # of free entries in tx ring */ volatile uint32_t txr_next; /* # next available tx ring entry */ volatile uint32_t txr_comp; /* index of last tx entry completed */ uint32_t tx_prod_reg; uint16_t tx_cntxt_id; uint8_t frame_hdr[QL_FRAME_HDR_SIZE]; } qla_hw_tx_cntxt_t; typedef struct _qla_mcast { uint16_t rsrvd; - uint8_t addr[6]; + uint8_t addr[ETHER_ADDR_LEN]; } __packed qla_mcast_t; typedef struct _qla_rdesc { volatile uint32_t prod_std; volatile uint32_t prod_jumbo; volatile uint32_t rx_next; /* next standard rcv ring to arm fw */ volatile int32_t rx_in; /* next standard rcv ring to add mbufs */ volatile uint64_t count; } qla_rdesc_t; typedef struct _qla_flash_desc_table { uint32_t flash_valid; uint16_t flash_ver; uint16_t flash_len; uint16_t flash_cksum; uint16_t flash_unused; uint8_t flash_model[16]; uint16_t flash_manuf; uint16_t flash_id; uint8_t flash_flag; uint8_t erase_cmd; uint8_t alt_erase_cmd; uint8_t write_enable_cmd; uint8_t write_enable_bits; uint8_t write_statusreg_cmd; uint8_t unprotected_sec_cmd; uint8_t read_manuf_cmd; uint32_t block_size; uint32_t alt_block_size; uint32_t flash_size; uint32_t write_enable_data; uint8_t readid_addr_len; uint8_t write_disable_bits; uint8_t read_dev_id_len; uint8_t chip_erase_cmd; uint16_t read_timeo; uint8_t protected_sec_cmd; uint8_t resvd[65]; } __packed qla_flash_desc_table_t; /* * struct for storing hardware specific information for a given interface */ typedef struct _qla_hw { struct { uint32_t unicast_mac :1, bcast_mac :1, loopback_mode :2, init_tx_cnxt :1, init_rx_cnxt :1, init_intr_cnxt :1, fduplex :1, autoneg :1, fdt_valid :1; } flags; uint16_t link_speed; uint16_t cable_length; uint32_t cable_oui; uint8_t link_up; uint8_t module_type; uint8_t link_faults; uint8_t mac_rcv_mode; uint32_t max_mtu; uint8_t mac_addr[ETHER_ADDR_LEN]; uint32_t num_sds_rings; uint32_t num_rds_rings; uint32_t num_tx_rings; qla_dmabuf_t dma_buf; /* Transmit Side */ qla_hw_tx_cntxt_t tx_cntxt[NUM_TX_RINGS]; /* Receive Side */ uint16_t rcv_cntxt_id; uint32_t mbx_intr_mask_offset; uint16_t intr_id[MAX_SDS_RINGS]; uint32_t intr_src[MAX_SDS_RINGS]; qla_sds_t sds[MAX_SDS_RINGS]; uint32_t mbox[Q8_NUM_MBOX]; qla_rdesc_t rds[MAX_RDS_RINGS]; uint32_t rds_pidx_thres; uint32_t sds_cidx_thres; uint32_t rcv_intr_coalesce; uint32_t xmt_intr_coalesce; /* Immediate Completion */ volatile uint32_t imd_compl; volatile uint32_t aen_mb0; volatile uint32_t aen_mb1; volatile uint32_t aen_mb2; volatile uint32_t aen_mb3; volatile uint32_t aen_mb4; /* multicast address list */ uint32_t nmcast; qla_mcast_t mcast[Q8_MAX_NUM_MULTICAST_ADDRS]; + uint8_t mac_addr_arr[(Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)]; /* reset sequence */ #define Q8_MAX_RESET_SEQ_IDX 16 uint32_t rst_seq[Q8_MAX_RESET_SEQ_IDX]; uint32_t rst_seq_idx; /* heart beat register value */ uint32_t hbeat_value; uint32_t health_count; uint32_t max_tx_segs; uint32_t min_lro_pkt_size; uint32_t enable_9kb; uint32_t user_pri_nic; uint32_t user_pri_iscsi; uint64_t iscsi_pkt_count; /* Flash Descriptor Table */ qla_flash_desc_table_t fdt; /* Minidump Related */ uint32_t mdump_init; uint32_t mdump_done; uint32_t mdump_active; uint32_t mdump_capture_mask; uint32_t mdump_start_seq_index; void *mdump_buffer; uint32_t mdump_buffer_size; void *mdump_template; uint32_t mdump_template_size; } qla_hw_t; #define QL_UPDATE_RDS_PRODUCER_INDEX(ha, prod_reg, val) \ bus_write_4((ha->pci_reg), prod_reg, val); #define QL_UPDATE_TX_PRODUCER_INDEX(ha, val, i) \ WRITE_REG32(ha, ha->hw.tx_cntxt[i].tx_prod_reg, val) #define QL_UPDATE_SDS_CONSUMER_INDEX(ha, i, val) \ bus_write_4((ha->pci_reg), (ha->hw.sds[i].sds_consumer), val); #define QL_ENABLE_INTERRUPTS(ha, i) \ bus_write_4((ha->pci_reg), (ha->hw.intr_src[i]), 0); #define QL_BUFFER_ALIGN 16 /* * Flash Configuration */ #define Q8_BOARD_CONFIG_OFFSET 0x370000 #define Q8_BOARD_CONFIG_LENGTH 0x2000 #define Q8_BOARD_CONFIG_MAC0_LO 0x400 #define Q8_FDT_LOCK_MAGIC_ID 0x00FD00FD #define Q8_FDT_FLASH_ADDR_VAL 0xFD009F #define Q8_FDT_FLASH_CTRL_VAL 0x3F #define Q8_FDT_MASK_VAL 0xFF #define Q8_WR_ENABLE_FL_ADDR 0xFD0100 #define Q8_WR_ENABLE_FL_CTRL 0x5 #define Q8_ERASE_LOCK_MAGIC_ID 0x00EF00EF #define Q8_ERASE_FL_ADDR_MASK 0xFD0300 #define Q8_ERASE_FL_CTRL_MASK 0x3D #define Q8_WR_FL_LOCK_MAGIC_ID 0xABCDABCD #define Q8_WR_FL_ADDR_MASK 0x800000 #define Q8_WR_FL_CTRL_MASK 0x3D #define QL_FDT_OFFSET 0x3F0000 #define Q8_FLASH_SECTOR_SIZE 0x10000 /* * Off Chip Memory Access */ typedef struct _q80_offchip_mem_val { uint32_t data_lo; uint32_t data_hi; uint32_t data_ulo; uint32_t data_uhi; } q80_offchip_mem_val_t; #endif /* #ifndef _QL_HW_H_ */ Index: user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_os.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_os.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_os.c (revision 306832) @@ -1,1813 +1,1815 @@ /* * Copyright (c) 2013-2016 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * and ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: ql_os.c * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. */ #include __FBSDID("$FreeBSD$"); #include "ql_os.h" #include "ql_hw.h" #include "ql_def.h" #include "ql_inline.h" #include "ql_ver.h" #include "ql_glbl.h" #include "ql_dbg.h" #include /* * Some PCI Configuration Space Related Defines */ #ifndef PCI_VENDOR_QLOGIC #define PCI_VENDOR_QLOGIC 0x1077 #endif #ifndef PCI_PRODUCT_QLOGIC_ISP8030 #define PCI_PRODUCT_QLOGIC_ISP8030 0x8030 #endif #define PCI_QLOGIC_ISP8030 \ ((PCI_PRODUCT_QLOGIC_ISP8030 << 16) | PCI_VENDOR_QLOGIC) /* * static functions */ static int qla_alloc_parent_dma_tag(qla_host_t *ha); static void qla_free_parent_dma_tag(qla_host_t *ha); static int qla_alloc_xmt_bufs(qla_host_t *ha); static void qla_free_xmt_bufs(qla_host_t *ha); static int qla_alloc_rcv_bufs(qla_host_t *ha); static void qla_free_rcv_bufs(qla_host_t *ha); static void qla_clear_tx_buf(qla_host_t *ha, qla_tx_buf_t *txb); static void qla_init_ifnet(device_t dev, qla_host_t *ha); static int qla_sysctl_get_stats(SYSCTL_HANDLER_ARGS); static int qla_sysctl_get_link_status(SYSCTL_HANDLER_ARGS); static void qla_release(qla_host_t *ha); static void qla_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error); static void qla_stop(qla_host_t *ha); static int qla_send(qla_host_t *ha, struct mbuf **m_headp); static void qla_tx_done(void *context, int pending); static void qla_get_peer(qla_host_t *ha); static void qla_error_recovery(void *context, int pending); static void qla_async_event(void *context, int pending); /* * Hooks to the Operating Systems */ static int qla_pci_probe (device_t); static int qla_pci_attach (device_t); static int qla_pci_detach (device_t); static void qla_init(void *arg); static int qla_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int qla_media_change(struct ifnet *ifp); static void qla_media_status(struct ifnet *ifp, struct ifmediareq *ifmr); static void qla_start(struct ifnet *ifp); static device_method_t qla_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, qla_pci_probe), DEVMETHOD(device_attach, qla_pci_attach), DEVMETHOD(device_detach, qla_pci_detach), { 0, 0 } }; static driver_t qla_pci_driver = { "ql", qla_pci_methods, sizeof (qla_host_t), }; static devclass_t qla83xx_devclass; DRIVER_MODULE(qla83xx, pci, qla_pci_driver, qla83xx_devclass, 0, 0); MODULE_DEPEND(qla83xx, pci, 1, 1, 1); MODULE_DEPEND(qla83xx, ether, 1, 1, 1); MALLOC_DEFINE(M_QLA83XXBUF, "qla83xxbuf", "Buffers for qla83xx driver"); #define QL_STD_REPLENISH_THRES 0 #define QL_JUMBO_REPLENISH_THRES 32 static char dev_str[64]; static char ver_str[64]; /* * Name: qla_pci_probe * Function: Validate the PCI device to be a QLA80XX device */ static int qla_pci_probe(device_t dev) { switch ((pci_get_device(dev) << 16) | (pci_get_vendor(dev))) { case PCI_QLOGIC_ISP8030: snprintf(dev_str, sizeof(dev_str), "%s v%d.%d.%d", "Qlogic ISP 83xx PCI CNA Adapter-Ethernet Function", QLA_VERSION_MAJOR, QLA_VERSION_MINOR, QLA_VERSION_BUILD); snprintf(ver_str, sizeof(ver_str), "v%d.%d.%d", QLA_VERSION_MAJOR, QLA_VERSION_MINOR, QLA_VERSION_BUILD); device_set_desc(dev, dev_str); break; default: return (ENXIO); } if (bootverbose) printf("%s: %s\n ", __func__, dev_str); return (BUS_PROBE_DEFAULT); } static void qla_add_sysctls(qla_host_t *ha) { device_t dev = ha->pci_dev; SYSCTL_ADD_STRING(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "version", CTLFLAG_RD, ver_str, 0, "Driver Version"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "stats", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_get_stats, "I", "Statistics"); SYSCTL_ADD_STRING(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "fw_version", CTLFLAG_RD, ha->fw_ver_str, 0, "firmware version"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "link_status", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_get_link_status, "I", "Link Status"); ha->dbg_level = 0; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "debug", CTLFLAG_RW, &ha->dbg_level, ha->dbg_level, "Debug Level"); ha->std_replenish = QL_STD_REPLENISH_THRES; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "std_replenish", CTLFLAG_RW, &ha->std_replenish, ha->std_replenish, "Threshold for Replenishing Standard Frames"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ipv4_lro", CTLFLAG_RD, &ha->ipv4_lro, "number of ipv4 lro completions"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ipv6_lro", CTLFLAG_RD, &ha->ipv6_lro, "number of ipv6 lro completions"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "tx_tso_frames", CTLFLAG_RD, &ha->tx_tso_frames, "number of Tx TSO Frames"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "hw_vlan_tx_frames", CTLFLAG_RD, &ha->hw_vlan_tx_frames, "number of Tx VLAN Frames"); return; } static void qla_watchdog(void *arg) { qla_host_t *ha = arg; qla_hw_t *hw; struct ifnet *ifp; uint32_t i; qla_hw_tx_cntxt_t *hw_tx_cntxt; hw = &ha->hw; ifp = ha->ifp; if (ha->flags.qla_watchdog_exit) { ha->qla_watchdog_exited = 1; return; } ha->qla_watchdog_exited = 0; if (!ha->flags.qla_watchdog_pause) { if (ql_hw_check_health(ha) || ha->qla_initiate_recovery || (ha->msg_from_peer == QL_PEER_MSG_RESET)) { ha->qla_watchdog_paused = 1; ha->flags.qla_watchdog_pause = 1; ha->qla_initiate_recovery = 0; ha->err_inject = 0; + device_printf(ha->pci_dev, + "%s: taskqueue_enqueue(err_task) \n", __func__); taskqueue_enqueue(ha->err_tq, &ha->err_task); } else if (ha->flags.qla_interface_up) { if (ha->async_event) { ha->async_event = 0; taskqueue_enqueue(ha->async_event_tq, &ha->async_event_task); } for (i = 0; i < ha->hw.num_tx_rings; i++) { hw_tx_cntxt = &hw->tx_cntxt[i]; if (qla_le32_to_host(*(hw_tx_cntxt->tx_cons)) != hw_tx_cntxt->txr_comp) { taskqueue_enqueue(ha->tx_tq, &ha->tx_task); break; } } if ((ifp->if_snd.ifq_head != NULL) && QL_RUNNING(ifp)) { taskqueue_enqueue(ha->tx_tq, &ha->tx_task); } ha->qla_watchdog_paused = 0; } else { ha->qla_watchdog_paused = 0; } } else { ha->qla_watchdog_paused = 1; } ha->watchdog_ticks = ha->watchdog_ticks++ % 1000; callout_reset(&ha->tx_callout, QLA_WATCHDOG_CALLOUT_TICKS, qla_watchdog, ha); } /* * Name: qla_pci_attach * Function: attaches the device to the operating system */ static int qla_pci_attach(device_t dev) { qla_host_t *ha = NULL; uint32_t rsrc_len; int i; uint32_t num_rcvq = 0; if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "cannot get softc\n"); return (ENOMEM); } memset(ha, 0, sizeof (qla_host_t)); if (pci_get_device(dev) != PCI_PRODUCT_QLOGIC_ISP8030) { device_printf(dev, "device is not ISP8030\n"); return (ENXIO); } ha->pci_func = pci_get_function(dev) & 0x1; ha->pci_dev = dev; pci_enable_busmaster(dev); ha->reg_rid = PCIR_BAR(0); ha->pci_reg = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->reg_rid, RF_ACTIVE); if (ha->pci_reg == NULL) { device_printf(dev, "unable to map any ports\n"); goto qla_pci_attach_err; } rsrc_len = (uint32_t) bus_get_resource_count(dev, SYS_RES_MEMORY, ha->reg_rid); mtx_init(&ha->hw_lock, "qla83xx_hw_lock", MTX_NETWORK_LOCK, MTX_SPIN); mtx_init(&ha->tx_lock, "qla83xx_tx_lock", MTX_NETWORK_LOCK, MTX_DEF); qla_add_sysctls(ha); ql_hw_add_sysctls(ha); ha->flags.lock_init = 1; ha->reg_rid1 = PCIR_BAR(2); ha->pci_reg1 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->reg_rid1, RF_ACTIVE); ha->msix_count = pci_msix_count(dev); if (ha->msix_count < (ha->hw.num_sds_rings + 1)) { device_printf(dev, "%s: msix_count[%d] not enough\n", __func__, ha->msix_count); goto qla_pci_attach_err; } QL_DPRINT2(ha, (dev, "%s: ha %p pci_func 0x%x rsrc_count 0x%08x" " msix_count 0x%x pci_reg %p\n", __func__, ha, ha->pci_func, rsrc_len, ha->msix_count, ha->pci_reg)); /* initialize hardware */ if (ql_init_hw(ha)) { device_printf(dev, "%s: ql_init_hw failed\n", __func__); goto qla_pci_attach_err; } device_printf(dev, "%s: firmware[%d.%d.%d.%d]\n", __func__, ha->fw_ver_major, ha->fw_ver_minor, ha->fw_ver_sub, ha->fw_ver_build); snprintf(ha->fw_ver_str, sizeof(ha->fw_ver_str), "%d.%d.%d.%d", ha->fw_ver_major, ha->fw_ver_minor, ha->fw_ver_sub, ha->fw_ver_build); if (qla_get_nic_partition(ha, NULL, &num_rcvq)) { device_printf(dev, "%s: qla_get_nic_partition failed\n", __func__); goto qla_pci_attach_err; } device_printf(dev, "%s: ha %p pci_func 0x%x rsrc_count 0x%08x" " msix_count 0x%x pci_reg %p num_rcvq = %d\n", __func__, ha, ha->pci_func, rsrc_len, ha->msix_count, ha->pci_reg, num_rcvq); #ifdef QL_ENABLE_ISCSI_TLV if ((ha->msix_count < 64) || (num_rcvq != 32)) { ha->hw.num_sds_rings = 15; ha->hw.num_tx_rings = 32; } #endif /* #ifdef QL_ENABLE_ISCSI_TLV */ ha->hw.num_rds_rings = ha->hw.num_sds_rings; ha->msix_count = ha->hw.num_sds_rings + 1; if (pci_alloc_msix(dev, &ha->msix_count)) { device_printf(dev, "%s: pci_alloc_msi[%d] failed\n", __func__, ha->msix_count); ha->msix_count = 0; goto qla_pci_attach_err; } ha->mbx_irq_rid = 1; ha->mbx_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ha->mbx_irq_rid, (RF_ACTIVE | RF_SHAREABLE)); if (ha->mbx_irq == NULL) { device_printf(dev, "could not allocate mbx interrupt\n"); goto qla_pci_attach_err; } if (bus_setup_intr(dev, ha->mbx_irq, (INTR_TYPE_NET | INTR_MPSAFE), NULL, ql_mbx_isr, ha, &ha->mbx_handle)) { device_printf(dev, "could not setup mbx interrupt\n"); goto qla_pci_attach_err; } for (i = 0; i < ha->hw.num_sds_rings; i++) { ha->irq_vec[i].sds_idx = i; ha->irq_vec[i].ha = ha; ha->irq_vec[i].irq_rid = 2 + i; ha->irq_vec[i].irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ha->irq_vec[i].irq_rid, (RF_ACTIVE | RF_SHAREABLE)); if (ha->irq_vec[i].irq == NULL) { device_printf(dev, "could not allocate interrupt\n"); goto qla_pci_attach_err; } if (bus_setup_intr(dev, ha->irq_vec[i].irq, (INTR_TYPE_NET | INTR_MPSAFE), NULL, ql_isr, &ha->irq_vec[i], &ha->irq_vec[i].handle)) { device_printf(dev, "could not setup interrupt\n"); goto qla_pci_attach_err; } } printf("%s: mp__ncpus %d sds %d rds %d msi-x %d\n", __func__, mp_ncpus, ha->hw.num_sds_rings, ha->hw.num_rds_rings, ha->msix_count); ql_read_mac_addr(ha); /* allocate parent dma tag */ if (qla_alloc_parent_dma_tag(ha)) { device_printf(dev, "%s: qla_alloc_parent_dma_tag failed\n", __func__); goto qla_pci_attach_err; } /* alloc all dma buffers */ if (ql_alloc_dma(ha)) { device_printf(dev, "%s: ql_alloc_dma failed\n", __func__); goto qla_pci_attach_err; } qla_get_peer(ha); if (ql_minidump_init(ha) != 0) { device_printf(dev, "%s: ql_minidump_init failed\n", __func__); goto qla_pci_attach_err; } /* create the o.s ethernet interface */ qla_init_ifnet(dev, ha); ha->flags.qla_watchdog_active = 1; ha->flags.qla_watchdog_pause = 0; TASK_INIT(&ha->tx_task, 0, qla_tx_done, ha); - ha->tx_tq = taskqueue_create_fast("qla_txq", M_NOWAIT, + ha->tx_tq = taskqueue_create("qla_txq", M_NOWAIT, taskqueue_thread_enqueue, &ha->tx_tq); taskqueue_start_threads(&ha->tx_tq, 1, PI_NET, "%s txq", device_get_nameunit(ha->pci_dev)); callout_init(&ha->tx_callout, TRUE); ha->flags.qla_callout_init = 1; /* create ioctl device interface */ if (ql_make_cdev(ha)) { device_printf(dev, "%s: ql_make_cdev failed\n", __func__); goto qla_pci_attach_err; } callout_reset(&ha->tx_callout, QLA_WATCHDOG_CALLOUT_TICKS, qla_watchdog, ha); TASK_INIT(&ha->err_task, 0, qla_error_recovery, ha); - ha->err_tq = taskqueue_create_fast("qla_errq", M_NOWAIT, + ha->err_tq = taskqueue_create("qla_errq", M_NOWAIT, taskqueue_thread_enqueue, &ha->err_tq); taskqueue_start_threads(&ha->err_tq, 1, PI_NET, "%s errq", device_get_nameunit(ha->pci_dev)); TASK_INIT(&ha->async_event_task, 0, qla_async_event, ha); - ha->async_event_tq = taskqueue_create_fast("qla_asyncq", M_NOWAIT, + ha->async_event_tq = taskqueue_create("qla_asyncq", M_NOWAIT, taskqueue_thread_enqueue, &ha->async_event_tq); taskqueue_start_threads(&ha->async_event_tq, 1, PI_NET, "%s asyncq", device_get_nameunit(ha->pci_dev)); QL_DPRINT2(ha, (dev, "%s: exit 0\n", __func__)); return (0); qla_pci_attach_err: qla_release(ha); QL_DPRINT2(ha, (dev, "%s: exit ENXIO\n", __func__)); return (ENXIO); } /* * Name: qla_pci_detach * Function: Unhooks the device from the operating system */ static int qla_pci_detach(device_t dev) { qla_host_t *ha = NULL; struct ifnet *ifp; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "cannot get softc\n"); return (ENOMEM); } ifp = ha->ifp; (void)QLA_LOCK(ha, __func__, 0); qla_stop(ha); QLA_UNLOCK(ha, __func__); qla_release(ha); QL_DPRINT2(ha, (dev, "%s: exit\n", __func__)); return (0); } /* * SYSCTL Related Callbacks */ static int qla_sysctl_get_stats(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; ql_get_stats(ha); } return (err); } static int qla_sysctl_get_link_status(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; ql_hw_link_status(ha); } return (err); } /* * Name: qla_release * Function: Releases the resources allocated for the device */ static void qla_release(qla_host_t *ha) { device_t dev; int i; dev = ha->pci_dev; if (ha->async_event_tq) { taskqueue_drain(ha->async_event_tq, &ha->async_event_task); taskqueue_free(ha->async_event_tq); } if (ha->err_tq) { taskqueue_drain(ha->err_tq, &ha->err_task); taskqueue_free(ha->err_tq); } if (ha->tx_tq) { taskqueue_drain(ha->tx_tq, &ha->tx_task); taskqueue_free(ha->tx_tq); } ql_del_cdev(ha); if (ha->flags.qla_watchdog_active) { ha->flags.qla_watchdog_exit = 1; while (ha->qla_watchdog_exited == 0) qla_mdelay(__func__, 1); } if (ha->flags.qla_callout_init) callout_stop(&ha->tx_callout); if (ha->ifp != NULL) ether_ifdetach(ha->ifp); ql_free_dma(ha); qla_free_parent_dma_tag(ha); if (ha->mbx_handle) (void)bus_teardown_intr(dev, ha->mbx_irq, ha->mbx_handle); if (ha->mbx_irq) (void) bus_release_resource(dev, SYS_RES_IRQ, ha->mbx_irq_rid, ha->mbx_irq); for (i = 0; i < ha->hw.num_sds_rings; i++) { if (ha->irq_vec[i].handle) { (void)bus_teardown_intr(dev, ha->irq_vec[i].irq, ha->irq_vec[i].handle); } if (ha->irq_vec[i].irq) { (void)bus_release_resource(dev, SYS_RES_IRQ, ha->irq_vec[i].irq_rid, ha->irq_vec[i].irq); } } if (ha->msix_count) pci_release_msi(dev); if (ha->flags.lock_init) { mtx_destroy(&ha->tx_lock); mtx_destroy(&ha->hw_lock); } if (ha->pci_reg) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->reg_rid, ha->pci_reg); if (ha->pci_reg1) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->reg_rid1, ha->pci_reg1); } /* * DMA Related Functions */ static void qla_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { *((bus_addr_t *)arg) = 0; if (error) { printf("%s: bus_dmamap_load failed (%d)\n", __func__, error); return; } *((bus_addr_t *)arg) = segs[0].ds_addr; return; } int ql_alloc_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf) { int ret = 0; device_t dev; bus_addr_t b_addr; dev = ha->pci_dev; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); ret = bus_dma_tag_create( ha->parent_tag,/* parent */ dma_buf->alignment, ((bus_size_t)(1ULL << 32)),/* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ dma_buf->size, /* maxsize */ 1, /* nsegments */ dma_buf->size, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &dma_buf->dma_tag); if (ret) { device_printf(dev, "%s: could not create dma tag\n", __func__); goto ql_alloc_dmabuf_exit; } ret = bus_dmamem_alloc(dma_buf->dma_tag, (void **)&dma_buf->dma_b, (BUS_DMA_ZERO | BUS_DMA_COHERENT | BUS_DMA_NOWAIT), &dma_buf->dma_map); if (ret) { bus_dma_tag_destroy(dma_buf->dma_tag); device_printf(dev, "%s: bus_dmamem_alloc failed\n", __func__); goto ql_alloc_dmabuf_exit; } ret = bus_dmamap_load(dma_buf->dma_tag, dma_buf->dma_map, dma_buf->dma_b, dma_buf->size, qla_dmamap_callback, &b_addr, BUS_DMA_NOWAIT); if (ret || !b_addr) { bus_dma_tag_destroy(dma_buf->dma_tag); bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); ret = -1; goto ql_alloc_dmabuf_exit; } dma_buf->dma_addr = b_addr; ql_alloc_dmabuf_exit: QL_DPRINT2(ha, (dev, "%s: exit ret 0x%08x tag %p map %p b %p sz 0x%x\n", __func__, ret, (void *)dma_buf->dma_tag, (void *)dma_buf->dma_map, (void *)dma_buf->dma_b, dma_buf->size)); return ret; } void ql_free_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf) { bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); bus_dma_tag_destroy(dma_buf->dma_tag); } static int qla_alloc_parent_dma_tag(qla_host_t *ha) { int ret; device_t dev; dev = ha->pci_dev; /* * Allocate parent DMA Tag */ ret = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ 1,((bus_size_t)(1ULL << 32)),/* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ 0, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &ha->parent_tag); if (ret) { device_printf(dev, "%s: could not create parent dma tag\n", __func__); return (-1); } ha->flags.parent_tag = 1; return (0); } static void qla_free_parent_dma_tag(qla_host_t *ha) { if (ha->flags.parent_tag) { bus_dma_tag_destroy(ha->parent_tag); ha->flags.parent_tag = 0; } } /* * Name: qla_init_ifnet * Function: Creates the Network Device Interface and Registers it with the O.S */ static void qla_init_ifnet(device_t dev, qla_host_t *ha) { struct ifnet *ifp; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); ifp = ha->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) panic("%s: cannot if_alloc()\n", device_get_nameunit(dev)); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); ifp->if_capabilities = IFCAP_LINKSTATE; ifp->if_mtu = ETHERMTU; ifp->if_init = qla_init; ifp->if_softc = ha; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = qla_ioctl; ifp->if_start = qla_start; IFQ_SET_MAXLEN(&ifp->if_snd, qla_get_ifq_snd_maxlen(ha)); ifp->if_snd.ifq_drv_maxlen = qla_get_ifq_snd_maxlen(ha); IFQ_SET_READY(&ifp->if_snd); ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; ether_ifattach(ifp, qla_get_mac_addr(ha)); ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_TSO4 | IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; ifp->if_capabilities |= IFCAP_VLAN_HWTSO; ifp->if_capenable = ifp->if_capabilities; ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifmedia_init(&ha->media, IFM_IMASK, qla_media_change, qla_media_status); ifmedia_add(&ha->media, (IFM_ETHER | qla_get_optics(ha) | IFM_FDX), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | IFM_AUTO), 0, NULL); ifmedia_set(&ha->media, (IFM_ETHER | IFM_AUTO)); QL_DPRINT2(ha, (dev, "%s: exit\n", __func__)); return; } static void qla_init_locked(qla_host_t *ha) { struct ifnet *ifp = ha->ifp; qla_stop(ha); if (qla_alloc_xmt_bufs(ha) != 0) return; qla_confirm_9kb_enable(ha); if (qla_alloc_rcv_bufs(ha) != 0) return; bcopy(IF_LLADDR(ha->ifp), ha->hw.mac_addr, ETHER_ADDR_LEN); ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO; ha->flags.stop_rcv = 0; if (ql_init_hw_if(ha) == 0) { ifp = ha->ifp; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; ha->flags.qla_watchdog_pause = 0; ha->hw_vlan_tx_frames = 0; ha->tx_tso_frames = 0; ha->flags.qla_interface_up = 1; } return; } static void qla_init(void *arg) { qla_host_t *ha; ha = (qla_host_t *)arg; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); (void)QLA_LOCK(ha, __func__, 0); qla_init_locked(ha); QLA_UNLOCK(ha, __func__); QL_DPRINT2(ha, (ha->pci_dev, "%s: exit\n", __func__)); } static int qla_set_multi(qla_host_t *ha, uint32_t add_multi) { uint8_t mta[Q8_MAX_NUM_MULTICAST_ADDRS * Q8_MAC_ADDR_LEN]; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = ha->ifp; int ret = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == Q8_MAX_NUM_MULTICAST_ADDRS) break; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * Q8_MAC_ADDR_LEN], Q8_MAC_ADDR_LEN); mcnt++; } if_maddr_runlock(ifp); if (QLA_LOCK(ha, __func__, 1) == 0) { ret = ql_hw_set_multi(ha, mta, mcnt, add_multi); QLA_UNLOCK(ha, __func__); } return (ret); } static int qla_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { int ret = 0; struct ifreq *ifr = (struct ifreq *)data; struct ifaddr *ifa = (struct ifaddr *)data; qla_host_t *ha; ha = (qla_host_t *)ifp->if_softc; switch (cmd) { case SIOCSIFADDR: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFADDR (0x%lx)\n", __func__, cmd)); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { (void)QLA_LOCK(ha, __func__, 0); qla_init_locked(ha); QLA_UNLOCK(ha, __func__); } QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFADDR (0x%lx) ipv4 [0x%08x]\n", __func__, cmd, ntohl(IA_SIN(ifa)->sin_addr.s_addr))); arp_ifinit(ifp, ifa); } else { ether_ioctl(ifp, cmd, data); } break; case SIOCSIFMTU: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFMTU (0x%lx)\n", __func__, cmd)); if (ifr->ifr_mtu > QLA_MAX_MTU) { ret = EINVAL; } else { (void) QLA_LOCK(ha, __func__, 0); ifp->if_mtu = ifr->ifr_mtu; ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { ret = ql_set_max_mtu(ha, ha->max_frame_size, ha->hw.rcv_cntxt_id); } if (ifp->if_mtu > ETHERMTU) ha->std_replenish = QL_JUMBO_REPLENISH_THRES; else ha->std_replenish = QL_STD_REPLENISH_THRES; QLA_UNLOCK(ha, __func__); if (ret) ret = EINVAL; } break; case SIOCSIFFLAGS: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFFLAGS (0x%lx)\n", __func__, cmd)); (void)QLA_LOCK(ha, __func__, 0); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ ha->if_flags) & IFF_PROMISC) { ret = ql_set_promisc(ha); } else if ((ifp->if_flags ^ ha->if_flags) & IFF_ALLMULTI) { ret = ql_set_allmulti(ha); } } else { qla_init_locked(ha); ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; ret = ql_set_max_mtu(ha, ha->max_frame_size, ha->hw.rcv_cntxt_id); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) qla_stop(ha); ha->if_flags = ifp->if_flags; } QLA_UNLOCK(ha, __func__); break; case SIOCADDMULTI: QL_DPRINT4(ha, (ha->pci_dev, "%s: %s (0x%lx)\n", __func__, "SIOCADDMULTI", cmd)); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (qla_set_multi(ha, 1)) ret = EINVAL; } break; case SIOCDELMULTI: QL_DPRINT4(ha, (ha->pci_dev, "%s: %s (0x%lx)\n", __func__, "SIOCDELMULTI", cmd)); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (qla_set_multi(ha, 0)) ret = EINVAL; } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFMEDIA/SIOCGIFMEDIA (0x%lx)\n", __func__, cmd)); ret = ifmedia_ioctl(ifp, ifr, &ha->media, cmd); break; case SIOCSIFCAP: { int mask = ifr->ifr_reqcap ^ ifp->if_capenable; QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFCAP (0x%lx)\n", __func__, cmd)); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) qla_init(ha); VLAN_CAPABILITIES(ifp); break; } default: QL_DPRINT4(ha, (ha->pci_dev, "%s: default (0x%lx)\n", __func__, cmd)); ret = ether_ioctl(ifp, cmd, data); break; } return (ret); } static int qla_media_change(struct ifnet *ifp) { qla_host_t *ha; struct ifmedia *ifm; int ret = 0; ha = (qla_host_t *)ifp->if_softc; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); ifm = &ha->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) ret = EINVAL; QL_DPRINT2(ha, (ha->pci_dev, "%s: exit\n", __func__)); return (ret); } static void qla_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { qla_host_t *ha; ha = (qla_host_t *)ifp->if_softc; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; ql_update_link_state(ha); if (ha->hw.link_up) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= (IFM_FDX | qla_get_optics(ha)); } QL_DPRINT2(ha, (ha->pci_dev, "%s: exit (%s)\n", __func__,\ (ha->hw.link_up ? "link_up" : "link_down"))); return; } static void qla_start(struct ifnet *ifp) { struct mbuf *m_head; qla_host_t *ha = (qla_host_t *)ifp->if_softc; QL_DPRINT8(ha, (ha->pci_dev, "%s: enter\n", __func__)); if (!mtx_trylock(&ha->tx_lock)) { QL_DPRINT8(ha, (ha->pci_dev, "%s: mtx_trylock(&ha->tx_lock) failed\n", __func__)); return; } if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) { QL_DPRINT8(ha, (ha->pci_dev, "%s: !IFF_DRV_RUNNING\n", __func__)); QLA_TX_UNLOCK(ha); return; } if (!ha->hw.link_up || !ha->watchdog_ticks) ql_update_link_state(ha); if (!ha->hw.link_up) { QL_DPRINT8(ha, (ha->pci_dev, "%s: link down\n", __func__)); QLA_TX_UNLOCK(ha); return; } while (ifp->if_snd.ifq_head != NULL) { IF_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) { QL_DPRINT8(ha, (ha->pci_dev, "%s: m_head == NULL\n", __func__)); break; } if (qla_send(ha, &m_head)) { if (m_head == NULL) break; QL_DPRINT8(ha, (ha->pci_dev, "%s: PREPEND\n", __func__)); ifp->if_drv_flags |= IFF_DRV_OACTIVE; IF_PREPEND(&ifp->if_snd, m_head); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); } QLA_TX_UNLOCK(ha); QL_DPRINT8(ha, (ha->pci_dev, "%s: exit\n", __func__)); return; } static int qla_send(qla_host_t *ha, struct mbuf **m_headp) { bus_dma_segment_t segs[QLA_MAX_SEGMENTS]; bus_dmamap_t map; int nsegs; int ret = -1; uint32_t tx_idx; struct mbuf *m_head = *m_headp; uint32_t txr_idx = ha->txr_idx; uint32_t iscsi_pdu = 0; QL_DPRINT8(ha, (ha->pci_dev, "%s: enter\n", __func__)); /* check if flowid is set */ if (M_HASHTYPE_GET(m_head) != M_HASHTYPE_NONE) { #ifdef QL_ENABLE_ISCSI_TLV if (qla_iscsi_pdu(ha, m_head) == 0) { iscsi_pdu = 1; txr_idx = m_head->m_pkthdr.flowid & ((ha->hw.num_tx_rings >> 1) - 1); } else { txr_idx = m_head->m_pkthdr.flowid & (ha->hw.num_tx_rings - 1); } #else txr_idx = m_head->m_pkthdr.flowid & (ha->hw.num_tx_rings - 1); #endif /* #ifdef QL_ENABLE_ISCSI_TLV */ } tx_idx = ha->hw.tx_cntxt[txr_idx].txr_next; map = ha->tx_ring[txr_idx].tx_buf[tx_idx].map; ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT); if (ret == EFBIG) { struct mbuf *m; QL_DPRINT8(ha, (ha->pci_dev, "%s: EFBIG [%d]\n", __func__, m_head->m_pkthdr.len)); m = m_defrag(m_head, M_NOWAIT); if (m == NULL) { ha->err_tx_defrag++; m_freem(m_head); *m_headp = NULL; device_printf(ha->pci_dev, "%s: m_defrag() = NULL [%d]\n", __func__, ret); return (ENOBUFS); } m_head = m; *m_headp = m_head; if ((ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT))) { ha->err_tx_dmamap_load++; device_printf(ha->pci_dev, "%s: bus_dmamap_load_mbuf_sg failed0[%d, %d]\n", __func__, ret, m_head->m_pkthdr.len); if (ret != ENOMEM) { m_freem(m_head); *m_headp = NULL; } return (ret); } } else if (ret) { ha->err_tx_dmamap_load++; device_printf(ha->pci_dev, "%s: bus_dmamap_load_mbuf_sg failed1[%d, %d]\n", __func__, ret, m_head->m_pkthdr.len); if (ret != ENOMEM) { m_freem(m_head); *m_headp = NULL; } return (ret); } QL_ASSERT(ha, (nsegs != 0), ("qla_send: empty packet")); bus_dmamap_sync(ha->tx_tag, map, BUS_DMASYNC_PREWRITE); if (!(ret = ql_hw_send(ha, segs, nsegs, tx_idx, m_head, txr_idx, iscsi_pdu))) { ha->tx_ring[txr_idx].count++; ha->tx_ring[txr_idx].tx_buf[tx_idx].m_head = m_head; } else { if (ret == EINVAL) { if (m_head) m_freem(m_head); *m_headp = NULL; } } QL_DPRINT8(ha, (ha->pci_dev, "%s: exit\n", __func__)); return (ret); } static void qla_stop(qla_host_t *ha) { struct ifnet *ifp = ha->ifp; device_t dev; dev = ha->pci_dev; ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE | IFF_DRV_RUNNING); QLA_TX_LOCK(ha); QLA_TX_UNLOCK(ha); ha->flags.qla_watchdog_pause = 1; while (!ha->qla_watchdog_paused) qla_mdelay(__func__, 1); ha->flags.qla_interface_up = 0; ql_hw_stop_rcv(ha); ql_del_hw_if(ha); qla_free_xmt_bufs(ha); qla_free_rcv_bufs(ha); return; } /* * Buffer Management Functions for Transmit and Receive Rings */ static int qla_alloc_xmt_bufs(qla_host_t *ha) { int ret = 0; uint32_t i, j; qla_tx_buf_t *txb; if (bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ QLA_MAX_TSO_FRAME_SIZE, /* maxsize */ QLA_MAX_SEGMENTS, /* nsegments */ PAGE_SIZE, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->tx_tag)) { device_printf(ha->pci_dev, "%s: tx_tag alloc failed\n", __func__); return (ENOMEM); } for (i = 0; i < ha->hw.num_tx_rings; i++) { bzero((void *)ha->tx_ring[i].tx_buf, (sizeof(qla_tx_buf_t) * NUM_TX_DESCRIPTORS)); } for (j = 0; j < ha->hw.num_tx_rings; j++) { for (i = 0; i < NUM_TX_DESCRIPTORS; i++) { txb = &ha->tx_ring[j].tx_buf[i]; if ((ret = bus_dmamap_create(ha->tx_tag, BUS_DMA_NOWAIT, &txb->map))) { ha->err_tx_dmamap_create++; device_printf(ha->pci_dev, "%s: bus_dmamap_create failed[%d]\n", __func__, ret); qla_free_xmt_bufs(ha); return (ret); } } } return 0; } /* * Release mbuf after it sent on the wire */ static void qla_clear_tx_buf(qla_host_t *ha, qla_tx_buf_t *txb) { QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); if (txb->m_head && txb->map) { bus_dmamap_unload(ha->tx_tag, txb->map); m_freem(txb->m_head); txb->m_head = NULL; } if (txb->map) bus_dmamap_destroy(ha->tx_tag, txb->map); QL_DPRINT2(ha, (ha->pci_dev, "%s: exit\n", __func__)); } static void qla_free_xmt_bufs(qla_host_t *ha) { int i, j; for (j = 0; j < ha->hw.num_tx_rings; j++) { for (i = 0; i < NUM_TX_DESCRIPTORS; i++) qla_clear_tx_buf(ha, &ha->tx_ring[j].tx_buf[i]); } if (ha->tx_tag != NULL) { bus_dma_tag_destroy(ha->tx_tag); ha->tx_tag = NULL; } for (i = 0; i < ha->hw.num_tx_rings; i++) { bzero((void *)ha->tx_ring[i].tx_buf, (sizeof(qla_tx_buf_t) * NUM_TX_DESCRIPTORS)); } return; } static int qla_alloc_rcv_std(qla_host_t *ha) { int i, j, k, r, ret = 0; qla_rx_buf_t *rxb; qla_rx_ring_t *rx_ring; for (r = 0; r < ha->hw.num_rds_rings; r++) { rx_ring = &ha->rx_ring[r]; for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &rx_ring->rx_buf[i]; ret = bus_dmamap_create(ha->rx_tag, BUS_DMA_NOWAIT, &rxb->map); if (ret) { device_printf(ha->pci_dev, "%s: dmamap[%d, %d] failed\n", __func__, r, i); for (k = 0; k < r; k++) { for (j = 0; j < NUM_RX_DESCRIPTORS; j++) { rxb = &ha->rx_ring[k].rx_buf[j]; bus_dmamap_destroy(ha->rx_tag, rxb->map); } } for (j = 0; j < i; j++) { bus_dmamap_destroy(ha->rx_tag, rx_ring->rx_buf[j].map); } goto qla_alloc_rcv_std_err; } } } qla_init_hw_rcv_descriptors(ha); for (r = 0; r < ha->hw.num_rds_rings; r++) { rx_ring = &ha->rx_ring[r]; for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &rx_ring->rx_buf[i]; rxb->handle = i; if (!(ret = ql_get_mbuf(ha, rxb, NULL))) { /* * set the physical address in the * corresponding descriptor entry in the * receive ring/queue for the hba */ qla_set_hw_rcv_desc(ha, r, i, rxb->handle, rxb->paddr, (rxb->m_head)->m_pkthdr.len); } else { device_printf(ha->pci_dev, "%s: ql_get_mbuf [%d, %d] failed\n", __func__, r, i); bus_dmamap_destroy(ha->rx_tag, rxb->map); goto qla_alloc_rcv_std_err; } } } return 0; qla_alloc_rcv_std_err: return (-1); } static void qla_free_rcv_std(qla_host_t *ha) { int i, r; qla_rx_buf_t *rxb; for (r = 0; r < ha->hw.num_rds_rings; r++) { for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &ha->rx_ring[r].rx_buf[i]; if (rxb->m_head != NULL) { bus_dmamap_unload(ha->rx_tag, rxb->map); bus_dmamap_destroy(ha->rx_tag, rxb->map); m_freem(rxb->m_head); rxb->m_head = NULL; } } } return; } static int qla_alloc_rcv_bufs(qla_host_t *ha) { int i, ret = 0; if (bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->rx_tag)) { device_printf(ha->pci_dev, "%s: rx_tag alloc failed\n", __func__); return (ENOMEM); } bzero((void *)ha->rx_ring, (sizeof(qla_rx_ring_t) * MAX_RDS_RINGS)); for (i = 0; i < ha->hw.num_sds_rings; i++) { ha->hw.sds[i].sdsr_next = 0; ha->hw.sds[i].rxb_free = NULL; ha->hw.sds[i].rx_free = 0; } ret = qla_alloc_rcv_std(ha); return (ret); } static void qla_free_rcv_bufs(qla_host_t *ha) { int i; qla_free_rcv_std(ha); if (ha->rx_tag != NULL) { bus_dma_tag_destroy(ha->rx_tag); ha->rx_tag = NULL; } bzero((void *)ha->rx_ring, (sizeof(qla_rx_ring_t) * MAX_RDS_RINGS)); for (i = 0; i < ha->hw.num_sds_rings; i++) { ha->hw.sds[i].sdsr_next = 0; ha->hw.sds[i].rxb_free = NULL; ha->hw.sds[i].rx_free = 0; } return; } int ql_get_mbuf(qla_host_t *ha, qla_rx_buf_t *rxb, struct mbuf *nmp) { register struct mbuf *mp = nmp; struct ifnet *ifp; int ret = 0; uint32_t offset; bus_dma_segment_t segs[1]; int nsegs, mbuf_size; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); ifp = ha->ifp; if (ha->hw.enable_9kb) mbuf_size = MJUM9BYTES; else mbuf_size = MCLBYTES; if (mp == NULL) { if (QL_ERR_INJECT(ha, INJCT_M_GETCL_M_GETJCL_FAILURE)) return(-1); if (ha->hw.enable_9kb) mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, mbuf_size); else mp = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (mp == NULL) { ha->err_m_getcl++; ret = ENOBUFS; device_printf(ha->pci_dev, "%s: m_getcl failed\n", __func__); goto exit_ql_get_mbuf; } mp->m_len = mp->m_pkthdr.len = mbuf_size; } else { mp->m_len = mp->m_pkthdr.len = mbuf_size; mp->m_data = mp->m_ext.ext_buf; mp->m_next = NULL; } offset = (uint32_t)((unsigned long long)mp->m_data & 0x7ULL); if (offset) { offset = 8 - offset; m_adj(mp, offset); } /* * Using memory from the mbuf cluster pool, invoke the bus_dma * machinery to arrange the memory mapping. */ ret = bus_dmamap_load_mbuf_sg(ha->rx_tag, rxb->map, mp, segs, &nsegs, BUS_DMA_NOWAIT); rxb->paddr = segs[0].ds_addr; if (ret || !rxb->paddr || (nsegs != 1)) { m_free(mp); rxb->m_head = NULL; device_printf(ha->pci_dev, "%s: bus_dmamap_load failed[%d, 0x%016llx, %d]\n", __func__, ret, (long long unsigned int)rxb->paddr, nsegs); ret = -1; goto exit_ql_get_mbuf; } rxb->m_head = mp; bus_dmamap_sync(ha->rx_tag, rxb->map, BUS_DMASYNC_PREREAD); exit_ql_get_mbuf: QL_DPRINT2(ha, (ha->pci_dev, "%s: exit ret = 0x%08x\n", __func__, ret)); return (ret); } static void qla_tx_done(void *context, int pending) { qla_host_t *ha = context; struct ifnet *ifp; ifp = ha->ifp; if (!ifp) return; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { QL_DPRINT8(ha, (ha->pci_dev, "%s: !IFF_DRV_RUNNING\n", __func__)); return; } ql_hw_tx_done(ha); qla_start(ha->ifp); } static void qla_get_peer(qla_host_t *ha) { device_t *peers; int count, i, slot; int my_slot = pci_get_slot(ha->pci_dev); if (device_get_children(device_get_parent(ha->pci_dev), &peers, &count)) return; for (i = 0; i < count; i++) { slot = pci_get_slot(peers[i]); if ((slot >= 0) && (slot == my_slot) && (pci_get_device(peers[i]) == pci_get_device(ha->pci_dev))) { if (ha->pci_dev != peers[i]) ha->peer_dev = peers[i]; } } } static void qla_send_msg_to_peer(qla_host_t *ha, uint32_t msg_to_peer) { qla_host_t *ha_peer; if (ha->peer_dev) { if ((ha_peer = device_get_softc(ha->peer_dev)) != NULL) { ha_peer->msg_from_peer = msg_to_peer; } } } static void qla_error_recovery(void *context, int pending) { qla_host_t *ha = context; uint32_t msecs_100 = 100; struct ifnet *ifp = ha->ifp; (void)QLA_LOCK(ha, __func__, 0); if (ha->flags.qla_interface_up) { ha->hw.imd_compl = 1; qla_mdelay(__func__, 300); ql_hw_stop_rcv(ha); ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE | IFF_DRV_RUNNING); QLA_TX_LOCK(ha); QLA_TX_UNLOCK(ha); } QLA_UNLOCK(ha, __func__); if ((ha->pci_func & 0x1) == 0) { if (!ha->msg_from_peer) { qla_send_msg_to_peer(ha, QL_PEER_MSG_RESET); while ((ha->msg_from_peer != QL_PEER_MSG_ACK) && msecs_100--) qla_mdelay(__func__, 100); } ha->msg_from_peer = 0; (void)QLA_LOCK(ha, __func__, 0); ql_minidump(ha); QLA_UNLOCK(ha, __func__); (void) ql_init_hw(ha); (void)QLA_LOCK(ha, __func__, 0); if (ha->flags.qla_interface_up) { qla_free_xmt_bufs(ha); qla_free_rcv_bufs(ha); } QLA_UNLOCK(ha, __func__); qla_send_msg_to_peer(ha, QL_PEER_MSG_ACK); } else { if (ha->msg_from_peer == QL_PEER_MSG_RESET) { ha->msg_from_peer = 0; qla_send_msg_to_peer(ha, QL_PEER_MSG_ACK); } else { qla_send_msg_to_peer(ha, QL_PEER_MSG_RESET); } while ((ha->msg_from_peer != QL_PEER_MSG_ACK) && msecs_100--) qla_mdelay(__func__, 100); ha->msg_from_peer = 0; (void) ql_init_hw(ha); (void)QLA_LOCK(ha, __func__, 0); if (ha->flags.qla_interface_up) { qla_free_xmt_bufs(ha); qla_free_rcv_bufs(ha); } QLA_UNLOCK(ha, __func__); } (void)QLA_LOCK(ha, __func__, 0); if (ha->flags.qla_interface_up) { if (qla_alloc_xmt_bufs(ha) != 0) { QLA_UNLOCK(ha, __func__); return; } qla_confirm_9kb_enable(ha); if (qla_alloc_rcv_bufs(ha) != 0) { QLA_UNLOCK(ha, __func__); return; } ha->flags.stop_rcv = 0; if (ql_init_hw_if(ha) == 0) { ifp = ha->ifp; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; ha->flags.qla_watchdog_pause = 0; } } else ha->flags.qla_watchdog_pause = 0; QLA_UNLOCK(ha, __func__); } static void qla_async_event(void *context, int pending) { qla_host_t *ha = context; (void)QLA_LOCK(ha, __func__, 0); qla_hw_async_event(ha); QLA_UNLOCK(ha, __func__); } Index: user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_ver.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_ver.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/dev/qlxgbe/ql_ver.h (revision 306832) @@ -1,41 +1,41 @@ /* * Copyright (c) 2013-2016 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * File: ql_ver.h * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. */ #ifndef _QL_VER_H_ #define _QL_VER_H_ #define QLA_VERSION_MAJOR 3 #define QLA_VERSION_MINOR 10 -#define QLA_VERSION_BUILD 30 +#define QLA_VERSION_BUILD 31 #endif /* #ifndef _QL_VER_H_ */ Index: user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c (revision 306832) @@ -1,2248 +1,2248 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. */ struct namecache_ts { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ char nc_name[0]; /* segment name + nul */ }; /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. * * These locks are used (in the order in which they can be taken): * NAME TYPE ROLE * vnodelock mtx vnode lists and v_cache_dd field protection * bucketlock rwlock for access to given set of hash buckets * ncneg_mtx mtx negative entry LRU management * * Additionally, ncneg_shrink_lock mtx is used to have at most one thread * shrinking the LRU list. * * It is legal to take multiple vnodelock and bucketlock locks. The locking * order is lower address first. Both are recursive. * * "." lookups are lockless. * * ".." and vnode -> name lookups require vnodelock. * * name -> vnode lookup requires the relevant bucketlock to be held for reading. * * Insertions and removals of entries require involved vnodes and bucketlocks * to be write-locked to prevent other threads from seeing the entry. * * Some lookups result in removal of the found entry (e.g. getting rid of a * negative entry with the intent to create a positive one), which poses a * problem when multiple threads reach the state. Similarly, two different * threads can purge two different vnodes and try to remove the same name. * * If the already held vnode lock is lower than the second required lock, we * can just take the other lock. However, in the opposite case, this could * deadlock. As such, this is resolved by trylocking and if that fails unlocking * the first node, locking everything in order and revalidating the state. */ /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); static u_long numneg; /* number of negative entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "Number of negative entries in namecache"); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "Number of namecache entries"); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "Number of namecache entries with vnodes held"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_int ncpurgeminvnodes; SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, "Number of vnodes below which purgevfs ignores the request"); struct nchstats nchstats; /* cache effectiveness statistics */ static struct mtx ncneg_shrink_lock; MTX_SYSINIT(vfscache_shrink_neg, &ncneg_shrink_lock, "Name Cache shrink neg", MTX_DEF); static struct mtx_padalign ncneg_mtx; MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF); static u_int numbucketlocks; static struct rwlock_padalign *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)])) static u_int numvnodelocks; static struct mtx *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { struct mtx *vlp; if (vp == NULL) return (NULL); vlp = &vnodelocks[(((uintptr_t)(vp) >> 8) % numvnodelocks)]; return (vlp); } /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t cache_zone_small; static uma_zone_t cache_zone_small_ts; static uma_zone_t cache_zone_large; static uma_zone_t cache_zone_large_ts; #define CACHE_PATH_CUTOFF 35 static struct namecache * cache_alloc(int len, int ts) { if (len > CACHE_PATH_CUTOFF) { if (ts) return (uma_zalloc(cache_zone_large_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_large, M_WAITOK)); } if (ts) return (uma_zalloc(cache_zone_small_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_small, M_WAITOK)); } static void cache_free(struct namecache *ncp) { int ts; if (ncp == NULL) return; ts = ncp->nc_flag & NCF_TS; if ((ncp->nc_flag & NCF_DVDROP) != 0) vdrop(ncp->nc_dvp); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) { if (ts) uma_zfree(cache_zone_small_ts, ncp); else uma_zfree(cache_zone_small, ncp); } else if (ts) uma_zfree(cache_zone_large_ts, ncp); else uma_zfree(cache_zone_large, ncp); } static char * nc_get_name(struct namecache *ncp) { struct namecache_ts *ncp_ts; if ((ncp->nc_flag & NCF_TS) == 0) return (ncp->nc_name); ncp_ts = (struct namecache_ts *)ncp; return (ncp_ts->nc_name); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp != NULL) *tsp = ((struct namecache_ts *)ncp)->nc_time; if (ticksp != NULL) *ticksp = ((struct namecache_ts *)ncp)->nc_ticks; } static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE_ULONG(name, descr) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); #define STATNODE_COUNTER(name, descr) \ static counter_u64_t name; \ SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); STATNODE_ULONG(numneg, "Number of negative cache entries"); STATNODE_ULONG(numcache, "Number of cache entries"); STATNODE_COUNTER(numcalls, "Number of cache lookups"); STATNODE_COUNTER(dothits, "Number of '.' hits"); STATNODE_COUNTER(dotdothits, "Number of '..' hits"); STATNODE_COUNTER(numchecks, "Number of checks in lookup"); STATNODE_COUNTER(nummiss, "Number of cache misses"); STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); /* These count for kern___getcwd(), too. */ STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE_COUNTER(numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, "Number of times zap_and_exit failed to lock"); static long cache_lock_vnodes_cel_3_failures; STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); static void cache_zap_locked(struct namecache *ncp, bool neg_locked); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static int cache_yield; SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, "Number of times cache called yield"); static void cache_maybe_yield(void) { if (should_yield()) { cache_yield++; kern_yield(PRI_USER); } } static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { uint32_t hash; hash = fnv_32_buf(name, len, FNV1_32_INIT); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); return (hash); } static inline struct rwlock * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp, int mode) { struct rwlock *blp; blp = NCP2BUCKETLOCK(ncp); rw_assert(blp, mode); } #else #define cache_assert_bucket_locked(x, y) do { } while (0) #endif #define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y)) static void _cache_sort(void **p1, void **p2) { void *tmp; if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) rw_wlock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) rw_wunlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort(&vlp1, &vlp2); MPASS(vlp2 != NULL); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); #ifdef DIAGNOSTIC /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) LIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management */ static void cache_negative_hit(struct namecache *ncp) { MPASS(ncp->nc_vp == NULL); mtx_lock(&ncneg_mtx); TAILQ_REMOVE(&ncneg, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); mtx_unlock(&ncneg_mtx); } static void cache_negative_insert(struct namecache *ncp) { MPASS(ncp->nc_vp == NULL); cache_assert_bucket_locked(ncp, RA_WLOCKED); mtx_lock(&ncneg_mtx); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); numneg++; mtx_unlock(&ncneg_mtx); } static void cache_negative_remove(struct namecache *ncp, bool neg_locked) { MPASS(ncp->nc_vp == NULL); cache_assert_bucket_locked(ncp, RA_WLOCKED); if (!neg_locked) mtx_lock(&ncneg_mtx); else mtx_assert(&ncneg_mtx, MA_OWNED); TAILQ_REMOVE(&ncneg, ncp, nc_dst); numneg--; if (!neg_locked) mtx_unlock(&ncneg_mtx); } static void cache_negative_zap_one(void) { struct namecache *ncp, *ncp2; struct mtx *dvlp; struct rwlock *blp; if (!mtx_trylock(&ncneg_shrink_lock)) return; mtx_lock(&ncneg_mtx); ncp = TAILQ_FIRST(&ncneg); if (ncp == NULL) { mtx_unlock(&ncneg_mtx); goto out; } MPASS(ncp->nc_vp == NULL); dvlp = VP2VNODELOCK(ncp->nc_dvp); blp = NCP2BUCKETLOCK(ncp); mtx_unlock(&ncneg_mtx); mtx_lock(dvlp); rw_wlock(blp); mtx_lock(&ncneg_mtx); ncp2 = TAILQ_FIRST(&ncneg); if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || blp != NCP2BUCKETLOCK(ncp2) || ncp2->nc_vp != NULL) { ncp = NULL; goto out_unlock_all; } cache_zap_locked(ncp, true); out_unlock_all: mtx_unlock(&ncneg_mtx); rw_wunlock(blp); mtx_unlock(dvlp); out: mtx_unlock(&ncneg_shrink_lock); cache_free(ncp); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap_locked(struct namecache *ncp, bool neg_locked) { cache_assert_vnode_locked(ncp->nc_vp); cache_assert_vnode_locked(ncp->nc_dvp); cache_assert_bucket_locked(ncp, RA_WLOCKED); CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp); if (ncp->nc_vp != NULL) { SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, nc_get_name(ncp), ncp->nc_vp); } else { SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, nc_get_name(ncp)); } LIST_REMOVE(ncp, nc_hash); if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == ncp->nc_dvp->v_cache_dd) ncp->nc_dvp->v_cache_dd = NULL; } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; atomic_subtract_rel_long(&numcachehv, 1); } } if (ncp->nc_vp) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); if (ncp == ncp->nc_vp->v_cache_dd) ncp->nc_vp->v_cache_dd = NULL; } else { cache_negative_remove(ncp, neg_locked); } atomic_subtract_rel_long(&numcache, 1); } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct rwlock *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_vp == NULL); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct rwlock *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_vp == NULL) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } static int cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct rwlock *blp; int error = 0; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); pvlp = VP2VNODELOCK(vp); if (ncp->nc_vp == NULL) { cache_zap_negative_locked_vnode_kl(ncp, vp); goto out; } blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); cache_sort(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) { error = EAGAIN; goto out; } to_unlock = vlp1; } rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); mtx_unlock(to_unlock); out: mtx_unlock(pvlp); return (error); } static int cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp) { struct mtx *dvlp, *vlp; cache_assert_bucket_locked(ncp, RA_RLOCKED); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { rw_runlock(blp); rw_wlock(blp); cache_zap_locked(ncp, false); rw_wunlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } rw_runlock(blp); return (EAGAIN); } static int cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, struct mtx **vlpp1, struct mtx **vlpp2) { struct mtx *dvlp, *vlp; cache_assert_bucket_locked(ncp, RA_WLOCKED); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = VP2VNODELOCK(ncp->nc_vp); cache_sort(&dvlp, &vlp); if (*vlpp1 == dvlp && *vlpp2 == vlp) { cache_zap_locked(ncp, false); cache_unlock_vnodes(dvlp, vlp); *vlpp1 = NULL; *vlpp2 = NULL; return (0); } if (*vlpp1 != NULL) mtx_unlock(*vlpp1); if (*vlpp2 != NULL) mtx_unlock(*vlpp2); *vlpp1 = NULL; *vlpp2 = NULL; if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp, false); cache_unlock_vnodes(dvlp, vlp); return (0); } rw_wunlock(blp); *vlpp1 = dvlp; *vlpp2 = vlp; if (*vlpp1 != NULL) mtx_lock(*vlpp1); mtx_lock(*vlpp2); rw_wlock(blp); return (EAGAIN); } static void cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) { if (blp != NULL) { rw_runlock(blp); mtx_assert(vlp, MA_NOTOWNED); } else { mtx_unlock(vlp); } } /* * Lookup an entry in the cache * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. If the lookup * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative caching), a status of ENOENT is returned. If the lookup * fails, a status of zero is returned. If the directory vnode is * recycled out from under us due to a forced unmount, a status of * ENOENT is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is * not recursively acquired. */ int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct rwlock *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error, ltype; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } retry: blp = NULL; dvlp = VP2VNODELOCK(dvp); error = 0; counter_u64_add(numcalls, 1); if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; VREF(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if ((*vpp)->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { counter_u64_add(dotdothits, 1); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); mtx_unlock(dvlp); return (0); } if ((cnp->cn_flags & MAKEENTRY) == 0) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_dvp != dvp) panic("dvp %p v_cache_dd %p\n", dvp, ncp); if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { dvp->v_cache_dd = NULL; mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) *vpp = ncp->nc_vp; else *vpp = ncp->nc_dvp; /* Return failure if negative entry was found. */ if (*vpp == NULL) goto negative_success; CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) *tsp = ((struct namecache_ts *)ncp)-> nc_dotdottime; goto success; } } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); rw_rlock(blp); LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { counter_u64_add(numchecks, 1); if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); if ((cnp->cn_flags & MAKEENTRY) == 0) { counter_u64_add(nummisszap, 1); } else { counter_u64_add(nummiss, 1); } goto unlock; } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { counter_u64_add(numposzaps, 1); goto zap_and_exit; } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp), *vpp); cache_out_ts(ncp, tsp, ticksp); goto success; } negative_success: /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { counter_u64_add(numnegzaps, 1); goto zap_and_exit; } counter_u64_add(numneghits, 1); cache_negative_hit(ncp); if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, nc_get_name(ncp)); cache_out_ts(ncp, tsp, ticksp); cache_lookup_unlock(blp, dvlp); return (ENOENT); success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ MPASS(dvp != *vpp); ltype = 0; /* silence gcc warning */ if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); } vhold(*vpp); cache_lookup_unlock(blp, dvlp); error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread); if (cnp->cn_flags & ISDOTDOT) { vn_lock(dvp, ltype | LK_RETRY); if (dvp->v_iflag & VI_DOOMED) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } } if (error) { *vpp = NULL; goto retry; } if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); } return (-1); unlock: cache_lookup_unlock(blp, dvlp); return (0); zap_and_exit: if (blp != NULL) error = cache_zap_rlocked_bucket(ncp, blp); else error = cache_zap_locked_vnode(ncp, dvp); if (error != 0) { zap_and_exit_bucket_fail++; cache_maybe_yield(); goto retry; } cache_free(ncp); return (0); } struct celockstate { struct mtx *vlp[3]; struct rwlock *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); vlp = VP2VNODELOCK(vp); if (vlp == NULL) return (true); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_lock_vnodes_cel_3_failures++; cache_unlock_vnodes_cel(cel); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, struct rwlock *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort(&blp1, &blp2); if (blp1 != NULL) { rw_wlock(blp1); cel->blp[0] = blp1; } rw_wlock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) rw_wunlock(cel->blp[0]); rw_wunlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. If the * entry is negative, ncelock is locked instead of the vnode. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). */ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct rwlock *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = vp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct rwlock *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = dvp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *n3; struct nchashhead *ncpp; uint32_t hash; int flag; int len; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, ("cache_enter: Adding a doomed vnode")); VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, ("cache_enter: Doomed vnode used as src")); if (!doingcache) return; /* * Avoid blowout in namecache entries. */ if (numcache >= desiredvnodes * ncsizefactor) return; cache_celockstate_init(&cel); ndd = NULL; flag = 0; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { len = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); /* * If dotdot entry already exists, just retarget it * to new parent vnode, otherwise continue with new * namecache entry allocation. */ if ((ncp = dvp->v_cache_dd) != NULL && ncp->nc_flag & NCF_ISDOTDOT) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); if (ncp->nc_vp != NULL) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); } else { cache_negative_remove(ncp, false); } if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); } else { cache_negative_insert(ncp); } ncp->nc_vp = vp; cache_enter_unlock(&cel); return; } dvp->v_cache_dd = NULL; cache_enter_unlock(&cel); cache_celockstate_init(&cel); SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); flag = NCF_ISDOTDOT; } } /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); ncp->nc_vp = vp; ncp->nc_dvp = dvp; ncp->nc_flag = flag; if (tsp != NULL) { n3 = (struct namecache_ts *)ncp; n3->nc_time = *tsp; n3->nc_ticks = ticks; n3->nc_flag |= NCF_TS; if (dtsp != NULL) { n3->nc_dotdottime = *dtsp; n3->nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1); cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); LIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) { if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n3 = (struct namecache_ts *)n2; n3->nc_time = ((struct namecache_ts *)ncp)->nc_time; n3->nc_ticks = ((struct namecache_ts *)ncp)->nc_ticks; if (dtsp != NULL) { n3->nc_dotdottime = ((struct namecache_ts *)ncp)-> nc_dotdottime; n3->nc_flag |= NCF_DTS; } } goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); dvp->v_cache_dd = ncp; } atomic_add_rel_long(&numcache, 1); if (vp != NULL) { if (vp->v_type == VDIR) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd, false); else ndd = NULL; } vp->v_cache_dd = ncp; } } else { vp->v_cache_dd = NULL; } } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { vhold(dvp); atomic_add_rel_long(&numcachehv, 1); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ LIST_INSERT_HEAD(ncpp, ncp, nc_hash); /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp), vp); } else { if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; cache_negative_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, nc_get_name(ncp)); } cache_enter_unlock(&cel); if (numneg * ncnegfactor > numcache) cache_negative_zap_one(); cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); cache_free(ncp); return; } static u_int cache_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; TAILQ_INIT(&ncneg); cache_zone_small = uma_zcreate("S VFS Cache", sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", sizeof(struct namecache) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", sizeof(struct namecache_ts) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); numbucketlocks = cache_roundup_2(mp_ncpus * 64); bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); numvnodelocks = cache_roundup_2(mp_ncpus * 64); vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); ncpurgeminvnodes = numbucketlocks; numcalls = counter_u64_alloc(M_WAITOK); dothits = counter_u64_alloc(M_WAITOK); dotdothits = counter_u64_alloc(M_WAITOK); numchecks = counter_u64_alloc(M_WAITOK); nummiss = counter_u64_alloc(M_WAITOK); nummisszap = counter_u64_alloc(M_WAITOK); numposzaps = counter_u64_alloc(M_WAITOK); numposhits = counter_u64_alloc(M_WAITOK); numnegzaps = counter_u64_alloc(M_WAITOK); numneghits = counter_u64_alloc(M_WAITOK); numfullpathcalls = counter_u64_alloc(M_WAITOK); numfullpathfail1 = counter_u64_alloc(M_WAITOK); numfullpathfail2 = counter_u64_alloc(M_WAITOK); numfullpathfail4 = counter_u64_alloc(M_WAITOK); numfullpathfound = counter_u64_alloc(M_WAITOK); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_changesize(int newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl; u_long new_nchash, old_nchash; struct namecache *ncp; uint32_t hash; int i; new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { free(new_nchashtbl, M_VFSCACHE); return; } /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; nchashtbl = new_nchashtbl; nchash = new_nchash; for (i = 0; i <= old_nchash; i++) { while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp); LIST_REMOVE(ncp, nc_hash); LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); } } cache_unlock_all_buckets(); cache_unlock_all_vnodes(); free(old_nchashtbl, M_VFSCACHE); } /* * Invalidate all entries to a particular vnode. */ void cache_purge(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp, *vlp2; CTR1(KTR_VFS, "cache_purge(%p)", vp); SDT_PROBE1(vfs, namecache, purge, done, vp); if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && vp->v_cache_dd == NULL) return; TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_lock(vlp); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Invalidate all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp; CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); SDT_PROBE1(vfs, namecache, purge_negative, done, vp); TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (ncp->nc_vp != NULL) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } mtx_unlock(vlp); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Flush all entries referencing a particular filesystem. */ void -cache_purgevfs(struct mount *mp) +cache_purgevfs(struct mount *mp, bool force) { TAILQ_HEAD(, namecache) ncps; struct mtx *vlp1, *vlp2; struct rwlock *blp; struct nchashhead *bucket; struct namecache *ncp, *nnp; u_long i, j, n_nchash; int error; /* Scan hash tables for applicable entries */ SDT_PROBE1(vfs, namecache, purgevfs, done, mp); - if (mp->mnt_nvnodelistsize <= ncpurgeminvnodes) + if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) return; TAILQ_INIT(&ncps); n_nchash = nchash + 1; vlp1 = vlp2 = NULL; for (i = 0; i < numbucketlocks; i++) { blp = (struct rwlock *)&bucketlocks[i]; rw_wlock(blp); for (j = i; j < n_nchash; j += numbucketlocks) { retry: bucket = &nchashtbl[j]; LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { cache_assert_bucket_locked(ncp, RA_WLOCKED); if (ncp->nc_dvp->v_mount != mp) continue; error = cache_zap_wlocked_bucket_kl(ncp, blp, &vlp1, &vlp2); if (error != 0) goto retry; TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); } } rw_wunlock(blp); if (vlp1 == NULL && vlp2 == NULL) cache_maybe_yield(); } if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = VOP_ACCESS(dvp, VEXEC, cred, td); if (error) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* * XXX All of these sysctls would probably be more productive dead. */ static int disablecwd; SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "Disable the getcwd syscall"); /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, MAXPATHLEN)); } int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen, u_int path_max) { char *bp, *tmpbuf; struct filedesc *fdp; struct vnode *cdir, *rdir; int error; if (disablecwd) return (ENODEV); if (buflen < 2) return (EINVAL); if (buflen > path_max) buflen = path_max; tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); cdir = fdp->fd_cdir; VREF(cdir); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); vrele(rdir); vrele(cdir); if (!error) { if (bufseg == UIO_SYSSPACE) bcopy(bp, buf, strlen(bp) + 1); else error = copyout(bp, buf, strlen(bp) + 1); #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(bp); #endif } free(tmpbuf, M_TEMP); return (error); } /* * Thus begins the fullpath magic. */ static int disablefullpath; SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, "Disable the vn_fullpath function"); /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; struct filedesc *fdp; struct vnode *rdir; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); vrele(rdir); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, nc_get_name(ncp), vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (dvp->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * The magic behind kern___getcwd() and vn_fullpath(). */ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen) { int error, slash_prefixed; #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; buflen--; buf[buflen] = '\0'; error = 0; slash_prefixed = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); vref(vp); if (vp->v_type != VDIR) { error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); if (error) return (error); if (buflen == 0) { vrele(vp); return (ENOMEM); } buf[--buflen] = '/'; slash_prefixed = 1; } while (vp != rdir && vp != rootvnode) { if (vp->v_vflag & VV_ROOT) { if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vp1 = vp->v_mount->mnt_vnodecovered; vref(vp1); vrele(vp); vp = vp1; continue; } if (vp->v_type != VDIR) { vrele(vp); counter_u64_add(numfullpathfail1, 1); error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = 1; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); *retbuf = buf + buflen; return (0); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vhold(ddvp); mtx_unlock(vlp); if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, nc_get_name(ncp), l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* ABI compat shims for old kernel modules. */ #undef cache_enter void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { cache_enter_time(dvp, vp, cnp, NULL, NULL); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If sysctl debug.disablefullpath is set, ENODEV is returned, * vnode is left locked and path remain untouched. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Return ENODEV if sysctl debug.disablefullpath==1 */ if (disablefullpath) return (ENODEV); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp, 0); error = vn_fullpath_global(td, vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE(&nd, NDF_ONLY_PNBUF); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } Index: user/alc/PQ_LAUNDRY/sys/kern/vfs_mount.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/vfs_mount.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/kern/vfs_mount.c (revision 306832) @@ -1,2003 +1,2003 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ int sys_nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. */ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_activevnodelist); mp->mnt_activevnodelistsize = 0; TAILQ_INIT(&mp->mnt_tmpfreevnodelist); mp->mnt_tmpfreevnodelistsize = 0; mp->mnt_ref = 0; (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_activevnodelistsize != 0) panic("vfs_mount_destroy: nonzero activevnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) fsflags &= ~MNT_RDONLY; else if (strcmp(opt->name, "rw") == 0) fsflags &= ~MNT_RDONLY; else if (strcmp(opt->name, "ro") == 0) fsflags |= MNT_RDONLY; else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; } else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; vfs_freeopt(optlist, opt); } } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (vfsp->vfc_vfsops->vfs_cmount == NULL) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags); return (error); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp; int error; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp, 0); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); if (error != 0) { vfs_unbusy(mp); vfs_mount_destroy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) panic("mount: lost mount"); VOP_UNLOCK(vp, 0); EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp, 0); mountcheckdirs(vp, newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct export_args export; void *bufp; struct mount *mp; int error, export_error, len; uint64_t flag; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&export, sizeof(export)); /* FALLTHROUGH */ case (sizeof(export)): bcopy(bufp, &export, len); export_error = vfs_export(mp, &export); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL)) return (EPERM); } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); /* debug.disablefullpath == 1 results in ENODEV */ if (error == 0 || error == ENODEV) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(uap->flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (uap->flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0 || error == ENODEV) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, uap->flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp, *fsrootvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vfs_rel(mp); return (error); } vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vn_finished_write(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ; if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ); if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vn_finished_write(mp); return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. */ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); /* * From now, we can claim that the use reference on the * coveredvp is ours, and the ref can be released only by * successfull unmount by us, or left for later unmount * attempt. The previously acquired hold reference is no * longer needed to protect the vnode from reuse. */ if (coveredvp != NULL) vdrop(coveredvp); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); - cache_purgevfs(mp); /* remove cache entries for this file sys */ + cache_purgevfs(mp, false); /* remove cache entries for this file sys */ vfs_deallocate_syncvnode(mp); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL && (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 || (error = VFS_SYNC(mp, MNT_WAIT)) == 0) error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL && (mp->mnt_flag & MNT_IGNORE) == 0) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ; if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vput(coveredvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; case 'g': case 'G': iv *= 1024; case 'm': case 'M': iv *= 1024; case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(opts, name, dest, len) struct vfsoptlist *opts; const char *name; void *dest; int len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { int error; error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat); if (sbp != &mp->mnt_stat) *sbp = mp->mnt_stat; return (error); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp) { bcopy(oexp, exp, sizeof(*oexp)); exp->ex_numsecflavors = 0; } Index: user/alc/PQ_LAUNDRY/sys/kern/vfs_mountroot.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/vfs_mountroot.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/kern/vfs_mountroot.c (revision 306832) @@ -1,1105 +1,1105 @@ /*- * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_rootdevname.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[][ :[] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. */ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); LIST_FOREACH(h, &root_holds, list) { if (h != LIST_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; if (root_mounted()) return (NULL); h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&root_holds_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL) return; mtx_lock(&root_holds_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); free(h, M_DEVBUF); } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } static void set_rootvnode(void) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("Cannot find root vnode"); VOP_UNLOCK(rootvnode, 0); p = curthread->td_proc; FILEDESC_XLOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_XUNLOCK(p->p_fd); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. */ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. */ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); - cache_purgevfs(mporoot); + cache_purgevfs(mporoot, true); if (mporoot != mpdevfs) - cache_purgevfs(mpdevfs); + cache_purgevfs(mpdevfs, true); VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vporoot); vporoot->v_mountedhere = NULL; mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); - cache_purgevfs(rootvnode->v_mount); + cache_purgevfs(rootvnode->v_mount, true); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); } if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); mporoot->mnt_vnodecovered = vp; vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VOP_UNLOCK(vp, 0); } else vput(vp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); vpdevfs->v_mountedhere = NULL; vrele(vpdevfs); } mpdevfs->mnt_vnodecovered = vp; vp->v_mountedhere = mpdevfs; VOP_UNLOCK(vp, 0); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); NDFREE(&nd, NDF_ONLY_PNBUF); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_unlinkat(td, AT_FDCWD, "/dev/dev", UIO_SYSSPACE, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" : [options]\n"); printf(" Mount using filesystem \n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:tank\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' && name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); PICKUP_GIANT(); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. */ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); error = kern_close(td, fd); out: free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. */ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static int parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread); error = namei(&nd); if (!error) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error != 0) ? 0 : 1; } #define ERRMSGL 255 static int parse_mount(char **conf) { char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int error; error = parse_token(conf, &tok); if (error) return (error); fs = tok; error = parse_skipto(&tok, ':'); if (error) { free(fs, M_TEMP); return (error); } parse_poke(&tok, '\0'); parse_advance(&tok); dev = tok; if (root_mount_mddev != -1) { /* Handle substitution for the md unit number. */ tok = strstr(dev, "md#"); if (tok != NULL) tok[2] = '0' + root_mount_mddev; } /* Parse options. */ error = parse_token(conf, &tok); opts = (error == 0) ? tok : NULL; printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", ERRMSGL); error = ENOENT; goto out; } error = vfs_mountroot_wait_if_neccessary(fs, dev); if (error != 0) goto out; ma = NULL; ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); out: if (error) { printf("Mounting from %s:%s failed with error %d", fs, dev, error); if (errmsg[0] != '\0') printf(": %s", errmsg); printf(".\n"); } free(fs, M_TEMP); free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? EDOOFUS : error); } #undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) { struct mount *mp; char *conf; int error; root_mount_mddev = -1; retry: conf = sbuf_data(sb); mp = TAILQ_NEXT(mpdevfs, mnt_list); error = (mp == NULL) ? 0 : EDOOFUS; root_mount_onfail = A_CONTINUE; while (mp == NULL) { error = parse_skipto(&conf, CC_NONWHITESPACE); if (error == PE_EOL) { parse_advance(&conf); continue; } if (error < 0) break; switch (parse_peek(&conf)) { case '#': error = parse_skipto(&conf, '\n'); break; case '.': error = parse_directive(&conf); break; default: error = parse_mount(&conf); if (error == -1) { printf("mountroot: invalid file system " "specification.\n"); error = 0; } break; } if (error < 0) break; /* Ignore any trailing garbage on the line. */ if (parse_peek(&conf) != '\n') { printf("mountroot: advancing to next directive...\n"); (void)parse_skipto(&conf, '\n'); } mp = TAILQ_NEXT(mpdevfs, mnt_list); } if (mp != NULL) return (0); /* * We failed to mount (a new) root. */ switch (root_mount_onfail) { case A_CONTINUE: break; case A_PANIC: panic("mountroot: unable to (re-)mount root."); /* NOTREACHED */ case A_RETRY: goto retry; case A_REBOOT: kern_reboot(RB_NOSYNC); /* NOTREACHED */ } return (error); } static void vfs_mountroot_conf0(struct sbuf *sb) { char *s, *tok, *mnt, *opt; int error; sbuf_printf(sb, ".onfail panic\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); if (boothowto & RB_ASKNAME) sbuf_printf(sb, ".ask\n"); #ifdef ROOTDEVNAME if (boothowto & RB_DFLTROOT) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (boothowto & RB_CDROM) { sbuf_printf(sb, "cd9660:/dev/cd0 ro\n"); sbuf_printf(sb, ".timeout 0\n"); sbuf_printf(sb, "cd9660:/dev/cd1 ro\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); } s = kern_getenv("vfs.root.mountfrom"); if (s != NULL) { opt = kern_getenv("vfs.root.mountfrom.options"); tok = s; error = parse_token(&tok, &mnt); while (!error) { sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ? opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) sbuf_printf(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct timeval lastfail; int curfail; curfail = 0; while (1) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&root_holds_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); } } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * specific device. */ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0') { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * eg for geom_part(4) to finish tasting. */ DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it's that way * to behave exactly as it used to work before. */ vfs_mountroot_wait(); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. */ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } if( strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so We need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); } Index: user/alc/PQ_LAUNDRY/sys/mips/conf/AR5312_BASE.hints =================================================================== --- user/alc/PQ_LAUNDRY/sys/mips/conf/AR5312_BASE.hints (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/mips/conf/AR5312_BASE.hints (revision 306832) @@ -0,0 +1,29 @@ +# $FreeBSD$ +hint.apb.0.at="nexus0" +hint.apb.0.irq=4 + +# uart0 +hint.uart.0.at="apb0" +# see atheros/uart_cpu_ar71xx.c why +3 +hint.uart.0.maddr=0x1C000003 +hint.uart.0.msize=0x20 +#hint.uart.0.irq=4 +#hint.uart.0.flags="0x30" + +# Watchdog +hint.ar5315_wdog.0.at="apb0" +hint.ar5315_wdog.0.irq=6 + +# Ethernet +hint.are.0.at="nexus0" +hint.are.0.maddr=0x18100000 +hint.are.0.msize=0x00100000 +hint.are.0.irq=1 + +hint.are.1.at="nexus0" +hint.are.1.maddr=0x18200000 +hint.are.1.msize=0x00100000 +hint.are.1.irq=2 + +# GEOM redboot FIS directory offset +#hint.redboot.0.fisoffset="0x007e0000" Property changes on: user/alc/PQ_LAUNDRY/sys/mips/conf/AR5312_BASE.hints ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/mips/conf/AR5315_BASE.hints =================================================================== --- user/alc/PQ_LAUNDRY/sys/mips/conf/AR5315_BASE.hints (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/mips/conf/AR5315_BASE.hints (revision 306832) @@ -0,0 +1,34 @@ +# $FreeBSD$ +hint.apb.0.at="nexus0" +hint.apb.0.irq=0 + +# uart0 +hint.uart.0.at="apb0" +hint.uart.0.maddr=0x11100003 +hint.uart.0.msize=0x20 +#hint.uart.0.irq=0 +#hint.uart.0.flags="0x30" + +# Watchdog +hint.ar5315_wdog.0.at="apb0" +hint.ar5315_wdog.0.irq=7 + +# SPI +hint.spi.0.at="nexus0" +hint.spi.0.maddr=0x11300000 +hint.spi.0.msize=0x0000000c +#hint.spi.0.irq=2 + +# Ethernet +hint.are.0.at="nexus0" +hint.are.0.maddr=0x10500000 +hint.are.0.msize=0x500000 +hint.are.0.irq=2 + +# Flash +hint.mx25l.0.at="spibus0" +hint.mx25l.0.cs=0 + +# GEOM redboot FIS directory offset +#hint.redboot.0.fisoffset="0x007e0000" + Property changes on: user/alc/PQ_LAUNDRY/sys/mips/conf/AR5315_BASE.hints ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5312 =================================================================== --- user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5312 (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5312 (revision 306832) @@ -0,0 +1,80 @@ +# +# AR5312 -- Kernel configuration file for FreeBSD/MIPS for Atheros 5312 systems +# +# This includes all the common drivers for the AR5312 boards +# +# $FreeBSD$ +# + +machine mips mips +#ident AR5312_BASE +cpu CPU_MIPS4KC +makeoptions KERNLOADADDR=0x80050000 +options HZ=1000 + +makeoptions MODULES_OVERRIDE="" + +files "../atheros/ar531x/files.ar5315" + +options INTRNG +options AR531X_1ST_GENERATION + +# For now, hints are per-board. + +hints "AR5312_BASE.hints" + +makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols + +# For small memory footprints +options VM_KMEM_SIZE_SCALE=1 + +options DDB +options KDB + +options SCHED_4BSD #4BSD scheduler +options INET #InterNETworking +options INET6 # IPv6 + +# options NFSCL #Network Filesystem Client + +options PSEUDOFS #Pseudo-filesystem framework +options _KPOSIX_PRIORITY_SCHEDULING #Posix P1003_1B real-time extensions + +# options NFS_LEGACYRPC +# Debugging for use in -current +options INVARIANTS +options INVARIANT_SUPPORT +options WITNESS +options WITNESS_SKIPSPIN +options DEBUG_REDZONE +options DEBUG_MEMGUARD + +options FFS #Berkeley Fast Filesystem +# options SOFTUPDATES #Enable FFS soft updates support +# options UFS_ACL #Support for access control lists +# options UFS_DIRHASH #Improve performance on big directories +# options MSDOSFS # Read MSDOS filesystems; useful for USB/CF + +device mii +device are + +device cfi +options CFI_HARDWAREBYTESWAP +device geom_redboot + +device ar5315_wdog + +device uart +device uart_ar5315 + +device loop +device ether +device md +device bpf +device random + +options ARGE_DEBUG # Enable if_arge debugging for now + +# Enable GPIO +device gpio +device gpioled Property changes on: user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5312 ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5315 =================================================================== --- user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5315 (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5315 (revision 306832) @@ -0,0 +1,80 @@ +# +# AR5315 -- Kernel configuration file for FreeBSD/MIPS for Atheros 5315 systems +# +# This includes all the common drivers for the AR5315 boards +# +# $FreeBSD$ +# + +machine mips mips +#ident AR5315_BASE +cpu CPU_MIPS4KC +makeoptions KERNLOADADDR=0x80050000 +options HZ=1000 + +makeoptions MODULES_OVERRIDE="" + +files "../atheros/ar531x/files.ar5315" + +options INTRNG + +# For now, hints are per-board. + +hints "AR5315_BASE.hints" + +makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols + +# For small memory footprints +options VM_KMEM_SIZE_SCALE=1 + +options DDB +options KDB + +options SCHED_4BSD #4BSD scheduler +options INET #InterNETworking +options INET6 # IPv6 + +# options NFSCL #Network Filesystem Client + +options PSEUDOFS #Pseudo-filesystem framework +options _KPOSIX_PRIORITY_SCHEDULING #Posix P1003_1B real-time extensions + +# options NFS_LEGACYRPC +# Debugging for use in -current +options INVARIANTS +options INVARIANT_SUPPORT +options WITNESS +options WITNESS_SKIPSPIN +options DEBUG_REDZONE +options DEBUG_MEMGUARD + +options FFS #Berkeley Fast Filesystem +# options SOFTUPDATES #Enable FFS soft updates support +# options UFS_ACL #Support for access control lists +# options UFS_DIRHASH #Improve performance on big directories +# options MSDOSFS # Read MSDOS filesystems; useful for USB/CF + +device mii +device are + +device ar5315_spi +device spibus +device mx25l +device geom_redboot + +device ar5315_wdog + +device uart +device uart_ar5315 + +device loop +device ether +device md +device bpf +device random + +options ARGE_DEBUG # Enable if_arge debugging for now + +# Enable GPIO +device gpio +device gpioled Property changes on: user/alc/PQ_LAUNDRY/sys/mips/conf/std.AR5315 ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/netinet/sctp_output.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet/sctp_output.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/netinet/sctp_output.c (revision 306832) @@ -1,13819 +1,13823 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #include #include #define SCTP_MAX_GAPS_INARRAY 4 struct sack_track { uint8_t right_edge; /* mergable on the right edge */ uint8_t left_edge; /* mergable on the left edge */ uint8_t num_entries; uint8_t spare; struct sctp_gap_ack_block gaps[SCTP_MAX_GAPS_INARRAY]; }; const struct sack_track sack_array[256] = { {0, 0, 0, 0, /* 0x00 */ {{0, 0}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x01 */ {{0, 0}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x02 */ {{1, 1}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x03 */ {{0, 1}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x04 */ {{2, 2}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x05 */ {{0, 0}, {2, 2}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x06 */ {{1, 2}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x07 */ {{0, 2}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x08 */ {{3, 3}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x09 */ {{0, 0}, {3, 3}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x0a */ {{1, 1}, {3, 3}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x0b */ {{0, 1}, {3, 3}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x0c */ {{2, 3}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x0d */ {{0, 0}, {2, 3}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x0e */ {{1, 3}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x0f */ {{0, 3}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x10 */ {{4, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x11 */ {{0, 0}, {4, 4}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x12 */ {{1, 1}, {4, 4}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x13 */ {{0, 1}, {4, 4}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x14 */ {{2, 2}, {4, 4}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x15 */ {{0, 0}, {2, 2}, {4, 4}, {0, 0} } }, {0, 0, 2, 0, /* 0x16 */ {{1, 2}, {4, 4}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x17 */ {{0, 2}, {4, 4}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x18 */ {{3, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x19 */ {{0, 0}, {3, 4}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x1a */ {{1, 1}, {3, 4}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x1b */ {{0, 1}, {3, 4}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x1c */ {{2, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x1d */ {{0, 0}, {2, 4}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x1e */ {{1, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x1f */ {{0, 4}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x20 */ {{5, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x21 */ {{0, 0}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x22 */ {{1, 1}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x23 */ {{0, 1}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x24 */ {{2, 2}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x25 */ {{0, 0}, {2, 2}, {5, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x26 */ {{1, 2}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x27 */ {{0, 2}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x28 */ {{3, 3}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x29 */ {{0, 0}, {3, 3}, {5, 5}, {0, 0} } }, {0, 0, 3, 0, /* 0x2a */ {{1, 1}, {3, 3}, {5, 5}, {0, 0} } }, {1, 0, 3, 0, /* 0x2b */ {{0, 1}, {3, 3}, {5, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x2c */ {{2, 3}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x2d */ {{0, 0}, {2, 3}, {5, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x2e */ {{1, 3}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x2f */ {{0, 3}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x30 */ {{4, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x31 */ {{0, 0}, {4, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x32 */ {{1, 1}, {4, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x33 */ {{0, 1}, {4, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x34 */ {{2, 2}, {4, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x35 */ {{0, 0}, {2, 2}, {4, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x36 */ {{1, 2}, {4, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x37 */ {{0, 2}, {4, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x38 */ {{3, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x39 */ {{0, 0}, {3, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x3a */ {{1, 1}, {3, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x3b */ {{0, 1}, {3, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x3c */ {{2, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x3d */ {{0, 0}, {2, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x3e */ {{1, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x3f */ {{0, 5}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x40 */ {{6, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x41 */ {{0, 0}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x42 */ {{1, 1}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x43 */ {{0, 1}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x44 */ {{2, 2}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x45 */ {{0, 0}, {2, 2}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x46 */ {{1, 2}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x47 */ {{0, 2}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x48 */ {{3, 3}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x49 */ {{0, 0}, {3, 3}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x4a */ {{1, 1}, {3, 3}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x4b */ {{0, 1}, {3, 3}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x4c */ {{2, 3}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x4d */ {{0, 0}, {2, 3}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x4e */ {{1, 3}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x4f */ {{0, 3}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x50 */ {{4, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x51 */ {{0, 0}, {4, 4}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x52 */ {{1, 1}, {4, 4}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x53 */ {{0, 1}, {4, 4}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x54 */ {{2, 2}, {4, 4}, {6, 6}, {0, 0} } }, {1, 0, 4, 0, /* 0x55 */ {{0, 0}, {2, 2}, {4, 4}, {6, 6} } }, {0, 0, 3, 0, /* 0x56 */ {{1, 2}, {4, 4}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x57 */ {{0, 2}, {4, 4}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x58 */ {{3, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x59 */ {{0, 0}, {3, 4}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x5a */ {{1, 1}, {3, 4}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x5b */ {{0, 1}, {3, 4}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x5c */ {{2, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x5d */ {{0, 0}, {2, 4}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x5e */ {{1, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x5f */ {{0, 4}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x60 */ {{5, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x61 */ {{0, 0}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x62 */ {{1, 1}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x63 */ {{0, 1}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x64 */ {{2, 2}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x65 */ {{0, 0}, {2, 2}, {5, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x66 */ {{1, 2}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x67 */ {{0, 2}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x68 */ {{3, 3}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x69 */ {{0, 0}, {3, 3}, {5, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x6a */ {{1, 1}, {3, 3}, {5, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x6b */ {{0, 1}, {3, 3}, {5, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x6c */ {{2, 3}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x6d */ {{0, 0}, {2, 3}, {5, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x6e */ {{1, 3}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x6f */ {{0, 3}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x70 */ {{4, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x71 */ {{0, 0}, {4, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x72 */ {{1, 1}, {4, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x73 */ {{0, 1}, {4, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x74 */ {{2, 2}, {4, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x75 */ {{0, 0}, {2, 2}, {4, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x76 */ {{1, 2}, {4, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x77 */ {{0, 2}, {4, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x78 */ {{3, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x79 */ {{0, 0}, {3, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x7a */ {{1, 1}, {3, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x7b */ {{0, 1}, {3, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x7c */ {{2, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x7d */ {{0, 0}, {2, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x7e */ {{1, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x7f */ {{0, 6}, {0, 0}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0x80 */ {{7, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x81 */ {{0, 0}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x82 */ {{1, 1}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x83 */ {{0, 1}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x84 */ {{2, 2}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x85 */ {{0, 0}, {2, 2}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x86 */ {{1, 2}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x87 */ {{0, 2}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x88 */ {{3, 3}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x89 */ {{0, 0}, {3, 3}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x8a */ {{1, 1}, {3, 3}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x8b */ {{0, 1}, {3, 3}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x8c */ {{2, 3}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x8d */ {{0, 0}, {2, 3}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x8e */ {{1, 3}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x8f */ {{0, 3}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x90 */ {{4, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x91 */ {{0, 0}, {4, 4}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x92 */ {{1, 1}, {4, 4}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x93 */ {{0, 1}, {4, 4}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x94 */ {{2, 2}, {4, 4}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0x95 */ {{0, 0}, {2, 2}, {4, 4}, {7, 7} } }, {0, 1, 3, 0, /* 0x96 */ {{1, 2}, {4, 4}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x97 */ {{0, 2}, {4, 4}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x98 */ {{3, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x99 */ {{0, 0}, {3, 4}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x9a */ {{1, 1}, {3, 4}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x9b */ {{0, 1}, {3, 4}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x9c */ {{2, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x9d */ {{0, 0}, {2, 4}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x9e */ {{1, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x9f */ {{0, 4}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xa0 */ {{5, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xa1 */ {{0, 0}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xa2 */ {{1, 1}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xa3 */ {{0, 1}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xa4 */ {{2, 2}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xa5 */ {{0, 0}, {2, 2}, {5, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xa6 */ {{1, 2}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xa7 */ {{0, 2}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xa8 */ {{3, 3}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xa9 */ {{0, 0}, {3, 3}, {5, 5}, {7, 7} } }, {0, 1, 4, 0, /* 0xaa */ {{1, 1}, {3, 3}, {5, 5}, {7, 7} } }, {1, 1, 4, 0, /* 0xab */ {{0, 1}, {3, 3}, {5, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xac */ {{2, 3}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xad */ {{0, 0}, {2, 3}, {5, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xae */ {{1, 3}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xaf */ {{0, 3}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xb0 */ {{4, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xb1 */ {{0, 0}, {4, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xb2 */ {{1, 1}, {4, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xb3 */ {{0, 1}, {4, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xb4 */ {{2, 2}, {4, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xb5 */ {{0, 0}, {2, 2}, {4, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xb6 */ {{1, 2}, {4, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xb7 */ {{0, 2}, {4, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xb8 */ {{3, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xb9 */ {{0, 0}, {3, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xba */ {{1, 1}, {3, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xbb */ {{0, 1}, {3, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xbc */ {{2, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xbd */ {{0, 0}, {2, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xbe */ {{1, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xbf */ {{0, 5}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xc0 */ {{6, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xc1 */ {{0, 0}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xc2 */ {{1, 1}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xc3 */ {{0, 1}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xc4 */ {{2, 2}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xc5 */ {{0, 0}, {2, 2}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xc6 */ {{1, 2}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xc7 */ {{0, 2}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xc8 */ {{3, 3}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xc9 */ {{0, 0}, {3, 3}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xca */ {{1, 1}, {3, 3}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xcb */ {{0, 1}, {3, 3}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xcc */ {{2, 3}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xcd */ {{0, 0}, {2, 3}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xce */ {{1, 3}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xcf */ {{0, 3}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xd0 */ {{4, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xd1 */ {{0, 0}, {4, 4}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xd2 */ {{1, 1}, {4, 4}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xd3 */ {{0, 1}, {4, 4}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xd4 */ {{2, 2}, {4, 4}, {6, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xd5 */ {{0, 0}, {2, 2}, {4, 4}, {6, 7} } }, {0, 1, 3, 0, /* 0xd6 */ {{1, 2}, {4, 4}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xd7 */ {{0, 2}, {4, 4}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xd8 */ {{3, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xd9 */ {{0, 0}, {3, 4}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xda */ {{1, 1}, {3, 4}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xdb */ {{0, 1}, {3, 4}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xdc */ {{2, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xdd */ {{0, 0}, {2, 4}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xde */ {{1, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xdf */ {{0, 4}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xe0 */ {{5, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xe1 */ {{0, 0}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xe2 */ {{1, 1}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xe3 */ {{0, 1}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xe4 */ {{2, 2}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xe5 */ {{0, 0}, {2, 2}, {5, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xe6 */ {{1, 2}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xe7 */ {{0, 2}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xe8 */ {{3, 3}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xe9 */ {{0, 0}, {3, 3}, {5, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xea */ {{1, 1}, {3, 3}, {5, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xeb */ {{0, 1}, {3, 3}, {5, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xec */ {{2, 3}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xed */ {{0, 0}, {2, 3}, {5, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xee */ {{1, 3}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xef */ {{0, 3}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xf0 */ {{4, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf1 */ {{0, 0}, {4, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xf2 */ {{1, 1}, {4, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf3 */ {{0, 1}, {4, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xf4 */ {{2, 2}, {4, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xf5 */ {{0, 0}, {2, 2}, {4, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xf6 */ {{1, 2}, {4, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf7 */ {{0, 2}, {4, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xf8 */ {{3, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf9 */ {{0, 0}, {3, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xfa */ {{1, 1}, {3, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xfb */ {{0, 1}, {3, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xfc */ {{2, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xfd */ {{0, 0}, {2, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xfe */ {{1, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 1, 0, /* 0xff */ {{0, 7}, {0, 0}, {0, 0}, {0, 0} } } }; int sctp_is_address_in_scope(struct sctp_ifa *ifa, struct sctp_scoping *scope, int do_update) { if ((scope->loopback_scope == 0) && (ifa->ifn_p) && SCTP_IFN_IS_IFT_LOOP(ifa->ifn_p)) { /* * skip loopback if not in scope * */ return (0); } switch (ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (scope->ipv4_addr_legal) { struct sockaddr_in *sin; sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* not in scope , unspecified */ return (0); } if ((scope->ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { /* private address not in scope */ return (0); } } else { return (0); } break; #endif #ifdef INET6 case AF_INET6: if (scope->ipv6_addr_legal) { struct sockaddr_in6 *sin6; /* * Must update the flags, bummer, which means any * IFA locks must now be applied HERE <-> */ if (do_update) { sctp_gather_internal_ifa_flags(ifa); } if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { return (0); } /* ok to use deprecated addresses? */ sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* skip unspecifed addresses */ return (0); } if ( /* (local_scope == 0) && */ (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))) { return (0); } if ((scope->site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { return (0); } } else { return (0); } break; #endif default: return (0); } return (1); } static struct mbuf * sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len) { #if defined(INET) || defined(INET6) struct sctp_paramhdr *parmh; struct mbuf *mret; uint16_t plen; #endif switch (ifa->address.sa.sa_family) { #ifdef INET case AF_INET: plen = (uint16_t) sizeof(struct sctp_ipv4addr_param); break; #endif #ifdef INET6 case AF_INET6: plen = (uint16_t) sizeof(struct sctp_ipv6addr_param); break; #endif default: return (m); } #if defined(INET) || defined(INET6) if (M_TRAILINGSPACE(m) >= plen) { /* easy side we just drop it on the end */ parmh = (struct sctp_paramhdr *)(SCTP_BUF_AT(m, SCTP_BUF_LEN(m))); mret = m; } else { /* Need more space */ mret = m; while (SCTP_BUF_NEXT(mret) != NULL) { mret = SCTP_BUF_NEXT(mret); } SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_NOWAIT, 1, MT_DATA); if (SCTP_BUF_NEXT(mret) == NULL) { /* We are hosed, can't add more addresses */ return (m); } mret = SCTP_BUF_NEXT(mret); parmh = mtod(mret, struct sctp_paramhdr *); } /* now add the parameter */ switch (ifa->address.sa.sa_family) { #ifdef INET case AF_INET: { struct sctp_ipv4addr_param *ipv4p; struct sockaddr_in *sin; sin = &ifa->address.sin; ipv4p = (struct sctp_ipv4addr_param *)parmh; parmh->param_type = htons(SCTP_IPV4_ADDRESS); parmh->param_length = htons(plen); ipv4p->addr = sin->sin_addr.s_addr; SCTP_BUF_LEN(mret) += plen; break; } #endif #ifdef INET6 case AF_INET6: { struct sctp_ipv6addr_param *ipv6p; struct sockaddr_in6 *sin6; sin6 = &ifa->address.sin6; ipv6p = (struct sctp_ipv6addr_param *)parmh; parmh->param_type = htons(SCTP_IPV6_ADDRESS); parmh->param_length = htons(plen); memcpy(ipv6p->addr, &sin6->sin6_addr, sizeof(ipv6p->addr)); /* clear embedded scope in the address */ in6_clearscope((struct in6_addr *)ipv6p->addr); SCTP_BUF_LEN(mret) += plen; break; } #endif default: return (m); } if (len != NULL) { *len += plen; } return (mret); #endif } struct mbuf * sctp_add_addresses_to_i_ia(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_scoping *scope, struct mbuf *m_at, int cnt_inits_to, uint16_t * padding_len, uint16_t * chunk_len) { struct sctp_vrf *vrf = NULL; int cnt, limit_out = 0, total_count; uint32_t vrf_id; vrf_id = inp->def_vrf_id; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_RUNLOCK(); return (m_at); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { struct sctp_ifa *sctp_ifap; struct sctp_ifn *sctp_ifnp; cnt = cnt_inits_to; if (vrf->total_ifa_count > SCTP_COUNT_LIMIT) { limit_out = 1; cnt = SCTP_ADDRESS_LIMIT; goto skip_count; } LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { if ((scope->loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) { /* * Skip loopback devices if loopback_scope * not set */ continue; } LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { #ifdef INET if ((sctp_ifap->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifap->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin6.sin6_addr) != 0)) { continue; } #endif if (sctp_is_addr_restricted(stcb, sctp_ifap)) { continue; } if (sctp_is_address_in_scope(sctp_ifap, scope, 1) == 0) { continue; } cnt++; if (cnt > SCTP_ADDRESS_LIMIT) { break; } } if (cnt > SCTP_ADDRESS_LIMIT) { break; } } skip_count: if (cnt > 1) { total_count = 0; LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { cnt = 0; if ((scope->loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) { /* * Skip loopback devices if * loopback_scope not set */ continue; } LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { #ifdef INET if ((sctp_ifap->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifap->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin6.sin6_addr) != 0)) { continue; } #endif if (sctp_is_addr_restricted(stcb, sctp_ifap)) { continue; } if (sctp_is_address_in_scope(sctp_ifap, scope, 0) == 0) { continue; } if ((chunk_len != NULL) && (padding_len != NULL) && (*padding_len > 0)) { memset(mtod(m_at, caddr_t)+*chunk_len, 0, *padding_len); SCTP_BUF_LEN(m_at) += *padding_len; *chunk_len += *padding_len; *padding_len = 0; } m_at = sctp_add_addr_to_mbuf(m_at, sctp_ifap, chunk_len); if (limit_out) { cnt++; total_count++; if (cnt >= 2) { /* * two from each * address */ break; } if (total_count > SCTP_ADDRESS_LIMIT) { /* No more addresses */ break; } } } } } } else { struct sctp_laddr *laddr; cnt = cnt_inits_to; /* First, how many ? */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) /* * Address being deleted by the system, dont * list. */ continue; if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* * Address being deleted on this ep don't * list. */ continue; } if (sctp_is_address_in_scope(laddr->ifa, scope, 1) == 0) { continue; } cnt++; } /* * To get through a NAT we only list addresses if we have * more than one. That way if you just bind a single address * we let the source of the init dictate our address. */ if (cnt > 1) { cnt = cnt_inits_to; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { continue; } if (sctp_is_address_in_scope(laddr->ifa, scope, 0) == 0) { continue; } if ((chunk_len != NULL) && (padding_len != NULL) && (*padding_len > 0)) { memset(mtod(m_at, caddr_t)+*chunk_len, 0, *padding_len); SCTP_BUF_LEN(m_at) += *padding_len; *chunk_len += *padding_len; *padding_len = 0; } m_at = sctp_add_addr_to_mbuf(m_at, laddr->ifa, chunk_len); cnt++; if (cnt >= SCTP_ADDRESS_LIMIT) { break; } } } } SCTP_IPI_ADDR_RUNLOCK(); return (m_at); } static struct sctp_ifa * sctp_is_ifa_addr_preferred(struct sctp_ifa *ifa, uint8_t dest_is_loop, uint8_t dest_is_priv, sa_family_t fam) { uint8_t dest_is_global = 0; /* dest_is_priv is true if destination is a private address */ /* dest_is_loop is true if destination is a loopback addresses */ /** * Here we determine if its a preferred address. A preferred address * means it is the same scope or higher scope then the destination. * L = loopback, P = private, G = global * ----------------------------------------- * src | dest | result * ---------------------------------------- * L | L | yes * ----------------------------------------- * P | L | yes-v4 no-v6 * ----------------------------------------- * G | L | yes-v4 no-v6 * ----------------------------------------- * L | P | no * ----------------------------------------- * P | P | yes * ----------------------------------------- * G | P | no * ----------------------------------------- * L | G | no * ----------------------------------------- * P | G | no * ----------------------------------------- * G | G | yes * ----------------------------------------- */ if (ifa->address.sa.sa_family != fam) { /* forget mis-matched family */ return (NULL); } if ((dest_is_priv == 0) && (dest_is_loop == 0)) { dest_is_global = 1; } SCTPDBG(SCTP_DEBUG_OUTPUT2, "Is destination preferred:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ifa->address.sa); /* Ok the address may be ok */ #ifdef INET6 if (fam == AF_INET6) { /* ok to use deprecated addresses? no lets not! */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:1\n"); return (NULL); } if (ifa->src_is_priv && !ifa->src_is_loop) { if (dest_is_loop) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:2\n"); return (NULL); } } if (ifa->src_is_glob) { if (dest_is_loop) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:3\n"); return (NULL); } } } #endif /* * Now that we know what is what, implement or table this could in * theory be done slicker (it used to be), but this is * straightforward and easier to validate :-) */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "src_loop:%d src_priv:%d src_glob:%d\n", ifa->src_is_loop, ifa->src_is_priv, ifa->src_is_glob); SCTPDBG(SCTP_DEBUG_OUTPUT3, "dest_loop:%d dest_priv:%d dest_glob:%d\n", dest_is_loop, dest_is_priv, dest_is_global); if ((ifa->src_is_loop) && (dest_is_priv)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:4\n"); return (NULL); } if ((ifa->src_is_glob) && (dest_is_priv)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:5\n"); return (NULL); } if ((ifa->src_is_loop) && (dest_is_global)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:6\n"); return (NULL); } if ((ifa->src_is_priv) && (dest_is_global)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:7\n"); return (NULL); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "YES\n"); /* its a preferred address */ return (ifa); } static struct sctp_ifa * sctp_is_ifa_addr_acceptable(struct sctp_ifa *ifa, uint8_t dest_is_loop, uint8_t dest_is_priv, sa_family_t fam) { uint8_t dest_is_global = 0; /** * Here we determine if its a acceptable address. A acceptable * address means it is the same scope or higher scope but we can * allow for NAT which means its ok to have a global dest and a * private src. * * L = loopback, P = private, G = global * ----------------------------------------- * src | dest | result * ----------------------------------------- * L | L | yes * ----------------------------------------- * P | L | yes-v4 no-v6 * ----------------------------------------- * G | L | yes * ----------------------------------------- * L | P | no * ----------------------------------------- * P | P | yes * ----------------------------------------- * G | P | yes - May not work * ----------------------------------------- * L | G | no * ----------------------------------------- * P | G | yes - May not work * ----------------------------------------- * G | G | yes * ----------------------------------------- */ if (ifa->address.sa.sa_family != fam) { /* forget non matching family */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa_fam:%d fam:%d\n", ifa->address.sa.sa_family, fam); return (NULL); } /* Ok the address may be ok */ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, &ifa->address.sa); SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst_is_loop:%d dest_is_priv:%d\n", dest_is_loop, dest_is_priv); if ((dest_is_loop == 0) && (dest_is_priv == 0)) { dest_is_global = 1; } #ifdef INET6 if (fam == AF_INET6) { /* ok to use deprecated addresses? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { return (NULL); } if (ifa->src_is_priv) { /* Special case, linklocal to loop */ if (dest_is_loop) return (NULL); } } #endif /* * Now that we know what is what, implement our table. This could in * theory be done slicker (it used to be), but this is * straightforward and easier to validate :-) */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa->src_is_loop:%d dest_is_priv:%d\n", ifa->src_is_loop, dest_is_priv); if ((ifa->src_is_loop == 1) && (dest_is_priv)) { return (NULL); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa->src_is_loop:%d dest_is_glob:%d\n", ifa->src_is_loop, dest_is_global); if ((ifa->src_is_loop == 1) && (dest_is_global)) { return (NULL); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "address is acceptable\n"); /* its an acceptable address */ return (ifa); } int sctp_is_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; if (stcb == NULL) { /* There are no restrictions, no TCB :-) */ return (0); } LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa == ifa) { /* Yes it is on the list */ return (1); } } return (0); } int sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; if (ifa == NULL) return (0); LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", __func__); continue; } if ((laddr->ifa == ifa) && laddr->action == 0) /* same pointer */ return (1); } return (0); } static struct sctp_ifa * sctp_choose_boundspecific_inp(struct sctp_inpcb *inp, sctp_route_t * ro, uint32_t vrf_id, int non_asoc_addr_ok, uint8_t dest_is_priv, uint8_t dest_is_loop, sa_family_t fam) { struct sctp_laddr *laddr, *starting_point; void *ifn; int resettotop = 0; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa, *sifa; struct sctp_vrf *vrf; uint32_t ifn_index; vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) return (NULL); ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); sctp_ifn = sctp_find_ifn(ifn, ifn_index); /* * first question, is the ifn we will emit on in our list, if so, we * want such an address. Note that we first looked for a preferred * address. */ if (sctp_ifn) { /* is a preferred one on the interface we route out? */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; sifa = sctp_is_ifa_addr_preferred(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (sctp_is_addr_in_ep(inp, sifa)) { atomic_add_int(&sifa->refcount, 1); return (sifa); } } } /* * ok, now we now need to find one on the list of the addresses. We * can't get one on the emitting interface so let's find first a * preferred one. If not that an acceptable one otherwise... we * return NULL. */ starting_point = inp->next_addr_touse; once_again: if (inp->next_addr_touse == NULL) { inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list); resettotop = 1; } for (laddr = inp->next_addr_touse; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (resettotop == 0) { inp->next_addr_touse = NULL; goto once_again; } inp->next_addr_touse = starting_point; resettotop = 0; once_again_too: if (inp->next_addr_touse == NULL) { inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list); resettotop = 1; } /* ok, what about an acceptable address in the inp */ for (laddr = inp->next_addr_touse; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (resettotop == 0) { inp->next_addr_touse = NULL; goto once_again_too; } /* * no address bound can be a source for the destination we are in * trouble */ return (NULL); } static struct sctp_ifa * sctp_choose_boundspecific_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, sctp_route_t * ro, uint32_t vrf_id, uint8_t dest_is_priv, uint8_t dest_is_loop, int non_asoc_addr_ok, sa_family_t fam) { struct sctp_laddr *laddr, *starting_point; void *ifn; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa, *sifa; uint8_t start_at_beginning = 0; struct sctp_vrf *vrf; uint32_t ifn_index; /* * first question, is the ifn we will emit on in our list, if so, we * want that one. */ vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) return (NULL); ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); sctp_ifn = sctp_find_ifn(ifn, ifn_index); /* * first question, is the ifn we will emit on in our list? If so, * we want that one. First we look for a preferred. Second, we go * for an acceptable. */ if (sctp_ifn) { /* first try for a preferred address on the ep */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; if (sctp_is_addr_in_ep(inp, sctp_ifa)) { sifa = sctp_is_ifa_addr_preferred(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } atomic_add_int(&sifa->refcount, 1); return (sifa); } } /* next try for an acceptable address on the ep */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; if (sctp_is_addr_in_ep(inp, sctp_ifa)) { sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } atomic_add_int(&sifa->refcount, 1); return (sifa); } } } /* * if we can't find one like that then we must look at all addresses * bound to pick one at first preferable then secondly acceptable. */ starting_point = stcb->asoc.last_used_address; sctp_from_the_top: if (stcb->asoc.last_used_address == NULL) { start_at_beginning = 1; stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list); } /* search beginning with the last used address */ for (laddr = stcb->asoc.last_used_address; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } stcb->asoc.last_used_address = laddr; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (start_at_beginning == 0) { stcb->asoc.last_used_address = NULL; goto sctp_from_the_top; } /* now try for any higher scope than the destination */ stcb->asoc.last_used_address = starting_point; start_at_beginning = 0; sctp_from_the_top2: if (stcb->asoc.last_used_address == NULL) { start_at_beginning = 1; stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list); } /* search beginning with the last used address */ for (laddr = stcb->asoc.last_used_address; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } stcb->asoc.last_used_address = laddr; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (start_at_beginning == 0) { stcb->asoc.last_used_address = NULL; goto sctp_from_the_top2; } return (NULL); } static struct sctp_ifa * sctp_select_nth_preferred_addr_from_ifn_boundall(struct sctp_ifn *ifn, struct sctp_inpcb *inp, struct sctp_tcb *stcb, int non_asoc_addr_ok, uint8_t dest_is_loop, uint8_t dest_is_priv, int addr_wanted, sa_family_t fam, sctp_route_t * ro ) { struct sctp_ifa *ifa, *sifa; int num_eligible_addr = 0; #ifdef INET6 struct sockaddr_in6 sin6, lsa6; if (fam == AF_INET6) { memcpy(&sin6, &ro->ro_dst, sizeof(struct sockaddr_in6)); (void)sa6_recoverscope(&sin6); } #endif /* INET6 */ LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) { #ifdef INET if ((ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; #ifdef INET6 if (fam == AF_INET6 && dest_is_loop && sifa->src_is_loop && sifa->src_is_priv) { /* * don't allow fe80::1 to be a src on loop ::1, we * don't list it to the peer so we will get an * abort. */ continue; } if (fam == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&sifa->address.sin6.sin6_addr) && IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { /* * link-local <-> link-local must belong to the same * scope. */ memcpy(&lsa6, &sifa->address.sin6, sizeof(struct sockaddr_in6)); (void)sa6_recoverscope(&lsa6); if (sin6.sin6_scope_id != lsa6.sin6_scope_id) { continue; } } #endif /* INET6 */ /* * Check if the IPv6 address matches to next-hop. In the * mobile case, old IPv6 address may be not deleted from the * interface. Then, the interface has previous and new * addresses. We should use one corresponding to the * next-hop. (by micchie) */ #ifdef INET6 if (stcb && fam == AF_INET6 && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { if (sctp_v6src_match_nexthop(&sifa->address.sin6, ro) == 0) { continue; } } #endif #ifdef INET /* Avoid topologically incorrect IPv4 address */ if (stcb && fam == AF_INET && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { if (sctp_v4src_match_nexthop(sifa, ro) == 0) { continue; } } #endif if (stcb) { if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) { continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some reason.. * probably not yet added. */ continue; } } if (num_eligible_addr >= addr_wanted) { return (sifa); } num_eligible_addr++; } return (NULL); } static int sctp_count_num_preferred_boundall(struct sctp_ifn *ifn, struct sctp_inpcb *inp, struct sctp_tcb *stcb, int non_asoc_addr_ok, uint8_t dest_is_loop, uint8_t dest_is_priv, sa_family_t fam) { struct sctp_ifa *ifa, *sifa; int num_eligible_addr = 0; LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) { #ifdef INET if ((ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((ifa->address.sa.sa_family == AF_INET6) && (stcb != NULL) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) { continue; } sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) { continue; } if (stcb) { if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) { continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some reason.. * probably not yet added. */ continue; } } num_eligible_addr++; } return (num_eligible_addr); } static struct sctp_ifa * sctp_choose_boundall(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, sctp_route_t * ro, uint32_t vrf_id, uint8_t dest_is_priv, uint8_t dest_is_loop, int non_asoc_addr_ok, sa_family_t fam) { int cur_addr_num = 0, num_preferred = 0; void *ifn; struct sctp_ifn *sctp_ifn, *looked_at = NULL, *emit_ifn; struct sctp_ifa *sctp_ifa, *sifa; uint32_t ifn_index; struct sctp_vrf *vrf; #ifdef INET int retried = 0; #endif /*- * For boundall we can use any address in the association. * If non_asoc_addr_ok is set we can use any address (at least in * theory). So we look for preferred addresses first. If we find one, * we use it. Otherwise we next try to get an address on the * interface, which we should be able to do (unless non_asoc_addr_ok * is false and we are routed out that way). In these cases where we * can't use the address of the interface we go through all the * ifn's looking for an address we can use and fill that in. Punting * means we send back address 0, which will probably cause problems * actually since then IP will fill in the address of the route ifn, * which means we probably already rejected it.. i.e. here comes an * abort :-<. */ vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) return (NULL); ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn from route:%p ifn_index:%d\n", ifn, ifn_index); emit_ifn = looked_at = sctp_ifn = sctp_find_ifn(ifn, ifn_index); if (sctp_ifn == NULL) { /* ?? We don't have this guy ?? */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "No ifn emit interface?\n"); goto bound_all_plan_b; } SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn_index:%d name:%s is emit interface\n", ifn_index, sctp_ifn->ifn_name); if (net) { cur_addr_num = net->indx_of_eligible_next_to_use; } num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, fam); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Found %d preferred source addresses for intf:%s\n", num_preferred, sctp_ifn->ifn_name); if (num_preferred == 0) { /* * no eligible addresses, we must use some other interface * address if we can find one. */ goto bound_all_plan_b; } /* * Ok we have num_eligible_addr set with how many we can use, this * may vary from call to call due to addresses being deprecated * etc.. */ if (cur_addr_num >= num_preferred) { cur_addr_num = 0; } /* * select the nth address from the list (where cur_addr_num is the * nth) and 0 is the first one, 1 is the second one etc... */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "cur_addr_num:%d\n", cur_addr_num); sctp_ifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, cur_addr_num, fam, ro); /* if sctp_ifa is NULL something changed??, fall to plan b. */ if (sctp_ifa) { atomic_add_int(&sctp_ifa->refcount, 1); if (net) { /* save off where the next one we will want */ net->indx_of_eligible_next_to_use = cur_addr_num + 1; } return (sctp_ifa); } /* * plan_b: Look at all interfaces and find a preferred address. If * no preferred fall through to plan_c. */ bound_all_plan_b: SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan B\n"); LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Examine interface %s\n", sctp_ifn->ifn_name); if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* wrong base scope */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "skip\n"); continue; } if ((sctp_ifn == looked_at) && looked_at) { /* already looked at this guy */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "already seen\n"); continue; } num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, fam); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Found ifn:%p %d preferred source addresses\n", ifn, num_preferred); if (num_preferred == 0) { /* None on this interface. */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "No preferred -- skipping to next\n"); continue; } SCTPDBG(SCTP_DEBUG_OUTPUT2, "num preferred:%d on interface:%p cur_addr_num:%d\n", num_preferred, (void *)sctp_ifn, cur_addr_num); /* * Ok we have num_eligible_addr set with how many we can * use, this may vary from call to call due to addresses * being deprecated etc.. */ if (cur_addr_num >= num_preferred) { cur_addr_num = 0; } sifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, cur_addr_num, fam, ro); if (sifa == NULL) continue; if (net) { net->indx_of_eligible_next_to_use = cur_addr_num + 1; SCTPDBG(SCTP_DEBUG_OUTPUT2, "we selected %d\n", cur_addr_num); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Source:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Dest:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &net->ro._l_addr.sa); } atomic_add_int(&sifa->refcount, 1); return (sifa); } #ifdef INET again_with_private_addresses_allowed: #endif /* plan_c: do we have an acceptable address on the emit interface */ sifa = NULL; SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan C: find acceptable on interface\n"); if (emit_ifn == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jump to Plan D - no emit_ifn\n"); goto plan_d; } LIST_FOREACH(sctp_ifa, &emit_ifn->ifalist, next_ifa) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifa:%p\n", (void *)sctp_ifa); #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jailed\n"); continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jailed\n"); continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Defer\n"); continue; } sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "IFA not acceptable\n"); continue; } if (stcb) { if (sctp_is_address_in_scope(sifa, &stcb->asoc.scope, 0) == 0) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "NOT in scope\n"); sifa = NULL; continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some reason.. * probably not yet added. */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Its restricted\n"); sifa = NULL; continue; } } atomic_add_int(&sifa->refcount, 1); goto out; } plan_d: /* * plan_d: We are in trouble. No preferred address on the emit * interface. And not even a preferred address on all interfaces. Go * out and see if we can find an acceptable address somewhere * amongst all interfaces. */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan D looked_at is %p\n", (void *)looked_at); LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* wrong base scope */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (stcb) { if (sctp_is_address_in_scope(sifa, &stcb->asoc.scope, 0) == 0) { sifa = NULL; continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some * reason.. probably not yet added. */ sifa = NULL; continue; } } goto out; } } #ifdef INET if (stcb) { if ((retried == 0) && (stcb->asoc.scope.ipv4_local_scope == 0)) { stcb->asoc.scope.ipv4_local_scope = 1; retried = 1; goto again_with_private_addresses_allowed; } else if (retried == 1) { stcb->asoc.scope.ipv4_local_scope = 0; } } #endif out: #ifdef INET if (sifa) { if (retried == 1) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* wrong base scope */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { struct sctp_ifa *tmp_sifa; #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; tmp_sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (tmp_sifa == NULL) { continue; } if (tmp_sifa == sifa) { continue; } if (stcb) { if (sctp_is_address_in_scope(tmp_sifa, &stcb->asoc.scope, 0) == 0) { continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, tmp_sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, tmp_sifa)) && (!sctp_is_addr_pending(stcb, tmp_sifa)))) { /* * It is restricted * for some reason.. * probably not yet * added. */ continue; } } if ((tmp_sifa->address.sin.sin_family == AF_INET) && (IN4_ISPRIVATE_ADDRESS(&(tmp_sifa->address.sin.sin_addr)))) { sctp_add_local_addr_restricted(stcb, tmp_sifa); } } } } atomic_add_int(&sifa->refcount, 1); } #endif return (sifa); } /* tcb may be NULL */ struct sctp_ifa * sctp_source_address_selection(struct sctp_inpcb *inp, struct sctp_tcb *stcb, sctp_route_t * ro, struct sctp_nets *net, int non_asoc_addr_ok, uint32_t vrf_id) { struct sctp_ifa *answer; uint8_t dest_is_priv, dest_is_loop; sa_family_t fam; #ifdef INET struct sockaddr_in *to = (struct sockaddr_in *)&ro->ro_dst; #endif #ifdef INET6 struct sockaddr_in6 *to6 = (struct sockaddr_in6 *)&ro->ro_dst; #endif /** * Rules: * - Find the route if needed, cache if I can. * - Look at interface address in route, Is it in the bound list. If so we * have the best source. * - If not we must rotate amongst the addresses. * * Cavets and issues * * Do we need to pay attention to scope. We can have a private address * or a global address we are sourcing or sending to. So if we draw * it out * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz * For V4 * ------------------------------------------ * source * dest * result * ----------------------------------------- * Private * Global * NAT * ----------------------------------------- * Private * Private * No problem * ----------------------------------------- * Global * Private * Huh, How will this work? * ----------------------------------------- * Global * Global * No Problem *------------------------------------------ * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz * For V6 *------------------------------------------ * source * dest * result * ----------------------------------------- * Linklocal * Global * * ----------------------------------------- * Linklocal * Linklocal * No problem * ----------------------------------------- * Global * Linklocal * Huh, How will this work? * ----------------------------------------- * Global * Global * No Problem *------------------------------------------ * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz * * And then we add to that what happens if there are multiple addresses * assigned to an interface. Remember the ifa on a ifn is a linked * list of addresses. So one interface can have more than one IP * address. What happens if we have both a private and a global * address? Do we then use context of destination to sort out which * one is best? And what about NAT's sending P->G may get you a NAT * translation, or should you select the G thats on the interface in * preference. * * Decisions: * * - count the number of addresses on the interface. * - if it is one, no problem except case . * For we will assume a NAT out there. * - if there are more than one, then we need to worry about scope P * or G. We should prefer G -> G and P -> P if possible. * Then as a secondary fall back to mixed types G->P being a last * ditch one. * - The above all works for bound all, but bound specific we need to * use the same concept but instead only consider the bound * addresses. If the bound set is NOT assigned to the interface then * we must use rotation amongst the bound addresses.. */ if (ro->ro_rt == NULL) { /* * Need a route to cache. */ SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } if (ro->ro_rt == NULL) { return (NULL); } fam = ro->ro_dst.sa_family; dest_is_priv = dest_is_loop = 0; /* Setup our scopes for the destination */ switch (fam) { #ifdef INET case AF_INET: /* Scope based on outbound address */ if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { dest_is_loop = 1; if (net != NULL) { /* mark it as local */ net->addr_is_local = 1; } } else if ((IN4_ISPRIVATE_ADDRESS(&to->sin_addr))) { dest_is_priv = 1; } break; #endif #ifdef INET6 case AF_INET6: /* Scope based on outbound address */ if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr) || SCTP_ROUTE_IS_REAL_LOOP(ro)) { /* * If the address is a loopback address, which * consists of "::1" OR "fe80::1%lo0", we are * loopback scope. But we don't use dest_is_priv * (link local addresses). */ dest_is_loop = 1; if (net != NULL) { /* mark it as local */ net->addr_is_local = 1; } } else if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { dest_is_priv = 1; } break; #endif } SCTPDBG(SCTP_DEBUG_OUTPUT2, "Select source addr for:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&ro->ro_dst); SCTP_IPI_ADDR_RLOCK(); if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* * Bound all case */ answer = sctp_choose_boundall(inp, stcb, net, ro, vrf_id, dest_is_priv, dest_is_loop, non_asoc_addr_ok, fam); SCTP_IPI_ADDR_RUNLOCK(); return (answer); } /* * Subset bound case */ if (stcb) { answer = sctp_choose_boundspecific_stcb(inp, stcb, ro, vrf_id, dest_is_priv, dest_is_loop, non_asoc_addr_ok, fam); } else { answer = sctp_choose_boundspecific_inp(inp, ro, vrf_id, non_asoc_addr_ok, dest_is_priv, dest_is_loop, fam); } SCTP_IPI_ADDR_RUNLOCK(); return (answer); } static int sctp_find_cmsg(int c_type, void *data, struct mbuf *control, size_t cpsize) { struct cmsghdr cmh; int tlen, at, found; struct sctp_sndinfo sndinfo; struct sctp_prinfo prinfo; struct sctp_authinfo authinfo; tlen = SCTP_BUF_LEN(control); at = 0; found = 0; /* * Independent of how many mbufs, find the c_type inside the control * structure and copy out the data. */ while (at < tlen) { if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ return (found); } m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ return (found); } if (((int)cmh.cmsg_len + at) > tlen) { /* We don't have the complete CMSG. */ return (found); } if ((cmh.cmsg_level == IPPROTO_SCTP) && ((c_type == cmh.cmsg_type) || ((c_type == SCTP_SNDRCV) && ((cmh.cmsg_type == SCTP_SNDINFO) || (cmh.cmsg_type == SCTP_PRINFO) || (cmh.cmsg_type == SCTP_AUTHINFO))))) { if (c_type == cmh.cmsg_type) { if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < cpsize) { return (found); } /* It is exactly what we want. Copy it out. */ m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), (int)cpsize, (caddr_t)data); return (1); } else { struct sctp_sndrcvinfo *sndrcvinfo; sndrcvinfo = (struct sctp_sndrcvinfo *)data; if (found == 0) { if (cpsize < sizeof(struct sctp_sndrcvinfo)) { return (found); } memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo)); } switch (cmh.cmsg_type) { case SCTP_SNDINFO: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct sctp_sndinfo)) { return (found); } m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct sctp_sndinfo), (caddr_t)&sndinfo); sndrcvinfo->sinfo_stream = sndinfo.snd_sid; sndrcvinfo->sinfo_flags = sndinfo.snd_flags; sndrcvinfo->sinfo_ppid = sndinfo.snd_ppid; sndrcvinfo->sinfo_context = sndinfo.snd_context; sndrcvinfo->sinfo_assoc_id = sndinfo.snd_assoc_id; break; case SCTP_PRINFO: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct sctp_prinfo)) { return (found); } m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct sctp_prinfo), (caddr_t)&prinfo); if (prinfo.pr_policy != SCTP_PR_SCTP_NONE) { sndrcvinfo->sinfo_timetolive = prinfo.pr_value; } else { sndrcvinfo->sinfo_timetolive = 0; } sndrcvinfo->sinfo_flags |= prinfo.pr_policy; break; case SCTP_AUTHINFO: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct sctp_authinfo)) { return (found); } m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct sctp_authinfo), (caddr_t)&authinfo); sndrcvinfo->sinfo_keynumber_valid = 1; sndrcvinfo->sinfo_keynumber = authinfo.auth_keynumber; break; default: return (found); } found = 1; } } at += CMSG_ALIGN(cmh.cmsg_len); } return (found); } static int sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *error) { struct cmsghdr cmh; int tlen, at; struct sctp_initmsg initmsg; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif tlen = SCTP_BUF_LEN(control); at = 0; while (at < tlen) { if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (1); } m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (1); } if (((int)cmh.cmsg_len + at) > tlen) { /* We don't have the complete CMSG. */ *error = EINVAL; return (1); } if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { case SCTP_INIT: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct sctp_initmsg)) { *error = EINVAL; return (1); } m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct sctp_initmsg), (caddr_t)&initmsg); if (initmsg.sinit_max_attempts) stcb->asoc.max_init_times = initmsg.sinit_max_attempts; if (initmsg.sinit_num_ostreams) stcb->asoc.pre_open_streams = initmsg.sinit_num_ostreams; if (initmsg.sinit_max_instreams) stcb->asoc.max_inbound_streams = initmsg.sinit_max_instreams; if (initmsg.sinit_max_init_timeo) stcb->asoc.initial_init_rto_max = initmsg.sinit_max_init_timeo; if (stcb->asoc.streamoutcnt < stcb->asoc.pre_open_streams) { struct sctp_stream_out *tmp_str; unsigned int i; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif /* Default is NOT correct */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, default:%d pre_open:%d\n", stcb->asoc.streamoutcnt, stcb->asoc.pre_open_streams); SCTP_TCB_UNLOCK(stcb); SCTP_MALLOC(tmp_str, struct sctp_stream_out *, (stcb->asoc.pre_open_streams * sizeof(struct sctp_stream_out)), SCTP_M_STRMO); SCTP_TCB_LOCK(stcb); if (tmp_str != NULL) { SCTP_FREE(stcb->asoc.strmout, SCTP_M_STRMO); stcb->asoc.strmout = tmp_str; stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt = stcb->asoc.pre_open_streams; } else { stcb->asoc.pre_open_streams = stcb->asoc.streamoutcnt; } for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = 0; stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { stcb->asoc.strmout[i].abandoned_sent[j] = 0; stcb->asoc.strmout[i].abandoned_unsent[j] = 0; } #else stcb->asoc.strmout[i].abandoned_sent[0] = 0; stcb->asoc.strmout[i].abandoned_unsent[0] = 0; #endif stcb->asoc.strmout[i].stream_no = i; stcb->asoc.strmout[i].last_msg_incomplete = 0; stcb->asoc.strmout[i].state = SCTP_STREAM_OPENING; stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL); } } break; #ifdef INET case SCTP_DSTADDRV4: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in_addr)) { *error = EINVAL; return (1); } memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = stcb->rport; m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); if ((sin.sin_addr.s_addr == INADDR_ANY) || (sin.sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { *error = EINVAL; return (1); } if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); } break; #endif #ifdef INET6 case SCTP_DSTADDRV6: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in6_addr)) { *error = EINVAL; return (1); } memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); if (IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { *error = EINVAL; return (1); } #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { in6_sin6_2_sin(&sin, &sin6); if ((sin.sin_addr.s_addr == INADDR_ANY) || (sin.sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { *error = EINVAL; return (1); } if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); } } else #endif if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); } break; #endif default: break; } } at += CMSG_ALIGN(cmh.cmsg_len); } return (0); } static struct sctp_tcb * sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, uint16_t port, struct mbuf *control, struct sctp_nets **net_p, int *error) { struct cmsghdr cmh; int tlen, at; struct sctp_tcb *stcb; struct sockaddr *addr; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif tlen = SCTP_BUF_LEN(control); at = 0; while (at < tlen) { if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (NULL); } m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (NULL); } if (((int)cmh.cmsg_len + at) > tlen) { /* We don't have the complete CMSG. */ *error = EINVAL; return (NULL); } if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { #ifdef INET case SCTP_DSTADDRV4: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in_addr)) { *error = EINVAL; return (NULL); } memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = port; m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); addr = (struct sockaddr *)&sin; break; #endif #ifdef INET6 case SCTP_DSTADDRV6: if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh))) < sizeof(struct in6_addr)) { *error = EINVAL; return (NULL); } memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = port; m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { in6_sin6_2_sin(&sin, &sin6); addr = (struct sockaddr *)&sin; } else #endif addr = (struct sockaddr *)&sin6; break; #endif default: addr = NULL; break; } if (addr) { stcb = sctp_findassociation_ep_addr(inp_p, addr, net_p, NULL, NULL); if (stcb != NULL) { return (stcb); } } } at += CMSG_ALIGN(cmh.cmsg_len); } return (NULL); } static struct mbuf * sctp_add_cookie(struct mbuf *init, int init_offset, struct mbuf *initack, int initack_offset, struct sctp_state_cookie *stc_in, uint8_t ** signature) { struct mbuf *copy_init, *copy_initack, *m_at, *sig, *mret; struct sctp_state_cookie *stc; struct sctp_paramhdr *ph; uint8_t *foo; int sig_offset; uint16_t cookie_sz; mret = sctp_get_mbuf_for_msg((sizeof(struct sctp_state_cookie) + sizeof(struct sctp_paramhdr)), 0, M_NOWAIT, 1, MT_DATA); if (mret == NULL) { return (NULL); } copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_NOWAIT); if (copy_init == NULL) { sctp_m_freem(mret); return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(copy_init, SCTP_MBUF_ICOPY); } #endif copy_initack = SCTP_M_COPYM(initack, initack_offset, M_COPYALL, M_NOWAIT); if (copy_initack == NULL) { sctp_m_freem(mret); sctp_m_freem(copy_init); return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(copy_initack, SCTP_MBUF_ICOPY); } #endif /* easy side we just drop it on the end */ ph = mtod(mret, struct sctp_paramhdr *); SCTP_BUF_LEN(mret) = sizeof(struct sctp_state_cookie) + sizeof(struct sctp_paramhdr); stc = (struct sctp_state_cookie *)((caddr_t)ph + sizeof(struct sctp_paramhdr)); ph->param_type = htons(SCTP_STATE_COOKIE); ph->param_length = 0; /* fill in at the end */ /* Fill in the stc cookie data */ memcpy(stc, stc_in, sizeof(struct sctp_state_cookie)); /* tack the INIT and then the INIT-ACK onto the chain */ cookie_sz = 0; for (m_at = mret; m_at; m_at = SCTP_BUF_NEXT(m_at)) { cookie_sz += SCTP_BUF_LEN(m_at); if (SCTP_BUF_NEXT(m_at) == NULL) { SCTP_BUF_NEXT(m_at) = copy_init; break; } } for (m_at = copy_init; m_at; m_at = SCTP_BUF_NEXT(m_at)) { cookie_sz += SCTP_BUF_LEN(m_at); if (SCTP_BUF_NEXT(m_at) == NULL) { SCTP_BUF_NEXT(m_at) = copy_initack; break; } } for (m_at = copy_initack; m_at; m_at = SCTP_BUF_NEXT(m_at)) { cookie_sz += SCTP_BUF_LEN(m_at); if (SCTP_BUF_NEXT(m_at) == NULL) { break; } } sig = sctp_get_mbuf_for_msg(SCTP_SECRET_SIZE, 0, M_NOWAIT, 1, MT_DATA); if (sig == NULL) { /* no space, so free the entire chain */ sctp_m_freem(mret); return (NULL); } SCTP_BUF_LEN(sig) = 0; SCTP_BUF_NEXT(m_at) = sig; sig_offset = 0; foo = (uint8_t *) (mtod(sig, caddr_t)+sig_offset); memset(foo, 0, SCTP_SIGNATURE_SIZE); *signature = foo; SCTP_BUF_LEN(sig) += SCTP_SIGNATURE_SIZE; cookie_sz += SCTP_SIGNATURE_SIZE; ph->param_length = htons(cookie_sz); return (mret); } static uint8_t sctp_get_ect(struct sctp_tcb *stcb) { if ((stcb != NULL) && (stcb->asoc.ecn_supported == 1)) { return (SCTP_ECT0_BIT); } else { return (0); } } #if defined(INET) || defined(INET6) static void sctp_handle_no_route(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "dropped packet - no valid source addr\n"); if (net) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Destination was "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT1, &net->ro._l_addr.sa); if (net->dest_state & SCTP_ADDR_CONFIRMED) { if ((net->dest_state & SCTP_ADDR_REACHABLE) && stcb) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "no route takes interface %p down\n", (void *)net); sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, (void *)net, so_locked); net->dest_state &= ~SCTP_ADDR_REACHABLE; net->dest_state &= ~SCTP_ADDR_PF; } } if (stcb) { if (net == stcb->asoc.primary_destination) { /* need a new primary */ struct sctp_nets *alt; alt = sctp_find_alternate_net(stcb, net, 0); if (alt != net) { if (stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); } stcb->asoc.alternate = alt; atomic_add_int(&stcb->asoc.alternate->ref_count, 1); if (net->ro._s_addr) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; } net->src_addr_selected = 0; } } } } } #endif static int sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, /* may be NULL */ struct sctp_nets *net, struct sockaddr *to, struct mbuf *m, uint32_t auth_offset, struct sctp_auth_chunk *auth, uint16_t auth_keyid, int nofragment_flag, int ecn_ok, int out_of_asoc_ok, uint16_t src_port, uint16_t dest_port, uint32_t v_tag, uint16_t port, union sctp_sockstore *over_addr, uint8_t mflowtype, uint32_t mflowid, #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) int so_locked SCTP_UNUSED #else int so_locked #endif ) /* nofragment_flag to tell if IP_DF should be set (IPv4 only) */ { /** * Given a mbuf chain (via SCTP_BUF_NEXT()) that holds a packet header * WITH an SCTPHDR but no IP header, endpoint inp and sa structure: * - fill in the HMAC digest of any AUTH chunk in the packet. * - calculate and fill in the SCTP checksum. * - prepend an IP address header. * - if boundall use INADDR_ANY. * - if boundspecific do source address selection. * - set fragmentation option for ipV4. * - On return from IP output, check/adjust mtu size of output * interface and smallest_mtu size as well. */ /* Will need ifdefs around this */ struct mbuf *newm; struct sctphdr *sctphdr; int packet_length; int ret; #if defined(INET) || defined(INET6) uint32_t vrf_id; #endif #if defined(INET) || defined(INET6) struct mbuf *o_pak; sctp_route_t *ro = NULL; struct udphdr *udp = NULL; #endif uint8_t tos_value; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so = NULL; #endif if ((net) && (net->dest_state & SCTP_ADDR_OUT_OF_SCOPE)) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); sctp_m_freem(m); return (EFAULT); } #if defined(INET) || defined(INET6) if (stcb) { vrf_id = stcb->asoc.vrf_id; } else { vrf_id = inp->def_vrf_id; } #endif /* fill in the HMAC digest for any AUTH chunk in the packet */ if ((auth != NULL) && (stcb != NULL)) { sctp_fill_hmac_digest_m(m, auth_offset, auth, stcb, auth_keyid); } if (net) { tos_value = net->dscp; } else if (stcb) { tos_value = stcb->asoc.default_dscp; } else { tos_value = inp->sctp_ep.default_dscp; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct ip *ip = NULL; sctp_route_t iproute; int len; len = SCTP_MIN_V4_OVERHEAD; if (port) { len += sizeof(struct udphdr); } newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA); if (newm == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_ALIGN_TO_END(newm, len); SCTP_BUF_LEN(newm) = len; SCTP_BUF_NEXT(newm) = m; m = newm; if (net != NULL) { m->m_pkthdr.flowid = net->flowid; M_HASHTYPE_SET(m, net->flowtype); } else { m->m_pkthdr.flowid = mflowid; M_HASHTYPE_SET(m, mflowtype); } packet_length = sctp_calculate_len(m); ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = (sizeof(struct ip) >> 2); if (tos_value == 0) { /* * This means especially, that it is not set * at the SCTP layer. So use the value from * the IP layer. */ tos_value = inp->ip_inp.inp.inp_ip_tos; } tos_value &= 0xfc; if (ecn_ok) { tos_value |= sctp_get_ect(stcb); } if ((nofragment_flag) && (port == 0)) { ip->ip_off = htons(IP_DF); } else { ip->ip_off = htons(0); } /* FreeBSD has a function for ip_id's */ ip_fillid(ip); ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl; ip->ip_len = htons(packet_length); ip->ip_tos = tos_value; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_SCTP; } ip->ip_sum = 0; if (net == NULL) { ro = &iproute; memset(&iproute, 0, sizeof(iproute)); memcpy(&ro->ro_dst, to, to->sa_len); } else { ro = (sctp_route_t *) & net->ro; } /* Now the address selection part */ ip->ip_dst.s_addr = ((struct sockaddr_in *)to)->sin_addr.s_addr; /* call the routine to select the src address */ if (net && out_of_asoc_ok == 0) { if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; if (ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } if (net->src_addr_selected == 0) { /* Cache the source address */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, ro, net, 0, vrf_id); net->src_addr_selected = 1; } if (net->ro._s_addr == NULL) { /* No route to host */ net->src_addr_selected = 0; sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } ip->ip_src = net->ro._s_addr->address.sin.sin_addr; } else { if (over_addr == NULL) { struct sctp_ifa *_lsrc; _lsrc = sctp_source_address_selection(inp, stcb, ro, net, out_of_asoc_ok, vrf_id); if (_lsrc == NULL) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } ip->ip_src = _lsrc->address.sin.sin_addr; sctp_free_ifa(_lsrc); } else { ip->ip_src = over_addr->sin.sin_addr; SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } } if (port) { if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; udp->uh_ulen = htons((uint16_t) (packet_length - sizeof(struct ip))); if (V_udp_cksum) { udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); } else { udp->uh_sum = 0; } sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); } else { sctphdr = (struct sctphdr *)((caddr_t)ip + sizeof(struct ip)); } sctphdr->src_port = src_port; sctphdr->dest_port = dest_port; sctphdr->v_tag = v_tag; sctphdr->checksum = 0; /* * If source address selection fails and we find no * route then the ip_output should fail as well with * a NO_ROUTE_TO_HOST type error. We probably should * catch that somewhere and abort the association * right away (assuming this is an INIT being sent). */ if (ro->ro_rt == NULL) { /* * src addr selection failed to find a route * (or valid source addr), so we can't get * there from here (yet)! */ sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } if (ro != &iproute) { memcpy(&iproute, ro, sizeof(*ro)); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv4 output routine from low level src addr:%x\n", (uint32_t) (ntohl(ip->ip_src.s_addr))); SCTPDBG(SCTP_DEBUG_OUTPUT3, "Destination is %x\n", (uint32_t) (ntohl(ip->ip_dst.s_addr))); SCTPDBG(SCTP_DEBUG_OUTPUT3, "RTP route is %p through\n", (void *)ro->ro_rt); if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { /* failed to prepend data, give up */ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); sctp_m_freem(m); return (ENOMEM); } SCTP_ATTACH_CHAIN(o_pak, m, packet_length); if (port) { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); #endif if (V_udp_cksum) { SCTP_ENABLE_UDP_CSUM(o_pak); } } else { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else m->m_pkthdr.csum_flags = CSUM_SCTP; m->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); #endif } #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) sctp_packet_log(o_pak); #endif /* send it out. table id is taken from stcb */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { so = SCTP_INP_SO(inp); SCTP_SOCKET_UNLOCK(so, 0); } #endif SCTP_IP_OUTPUT(ret, o_pak, ro, stcb, vrf_id); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 0); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); } #endif SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) SCTP_STAT_INCR(sctps_senderrors); SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret); if (net == NULL) { /* free tempy routes */ RO_RTFREE(ro); } else { /* * PMTU check versus smallest asoc MTU goes * here */ if ((ro->ro_rt != NULL) && (net->ro._s_addr)) { uint32_t mtu; mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt); if (net->port) { mtu -= sizeof(struct udphdr); } if (mtu && (stcb->asoc.smallest_mtu > mtu)) { sctp_mtu_size_reset(inp, &stcb->asoc, mtu); net->mtu = mtu; } } else if (ro->ro_rt == NULL) { /* route was freed */ if (net->ro._s_addr && net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; } net->src_addr_selected = 0; } } return (ret); } #endif #ifdef INET6 case AF_INET6: { uint32_t flowlabel, flowinfo; struct ip6_hdr *ip6h; struct route_in6 ip6route; struct ifnet *ifp; struct sockaddr_in6 *sin6, tmp, *lsa6, lsa6_tmp; int prev_scope = 0; struct sockaddr_in6 lsa6_storage; int error; u_short prev_port = 0; int len; if (net) { flowlabel = net->flowlabel; } else if (stcb) { flowlabel = stcb->asoc.default_flowlabel; } else { flowlabel = inp->sctp_ep.default_flowlabel; } if (flowlabel == 0) { /* * This means especially, that it is not set * at the SCTP layer. So use the value from * the IP layer. */ flowlabel = ntohl(((struct in6pcb *)inp)->in6p_flowinfo); } flowlabel &= 0x000fffff; len = SCTP_MIN_OVERHEAD; if (port) { len += sizeof(struct udphdr); } newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA); if (newm == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_ALIGN_TO_END(newm, len); SCTP_BUF_LEN(newm) = len; SCTP_BUF_NEXT(newm) = m; m = newm; if (net != NULL) { m->m_pkthdr.flowid = net->flowid; M_HASHTYPE_SET(m, net->flowtype); } else { m->m_pkthdr.flowid = mflowid; M_HASHTYPE_SET(m, mflowtype); } packet_length = sctp_calculate_len(m); ip6h = mtod(m, struct ip6_hdr *); /* protect *sin6 from overwrite */ sin6 = (struct sockaddr_in6 *)to; tmp = *sin6; sin6 = &tmp; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } if (net == NULL) { memset(&ip6route, 0, sizeof(ip6route)); ro = (sctp_route_t *) & ip6route; memcpy(&ro->ro_dst, sin6, sin6->sin6_len); } else { ro = (sctp_route_t *) & net->ro; } /* * We assume here that inp_flow is in host byte * order within the TCB! */ if (tos_value == 0) { /* * This means especially, that it is not set * at the SCTP layer. So use the value from * the IP layer. */ tos_value = (ntohl(((struct in6pcb *)inp)->in6p_flowinfo) >> 20) & 0xff; } tos_value &= 0xfc; if (ecn_ok) { tos_value |= sctp_get_ect(stcb); } flowinfo = 0x06; flowinfo <<= 8; flowinfo |= tos_value; flowinfo <<= 20; flowinfo |= flowlabel; ip6h->ip6_flow = htonl(flowinfo); if (port) { ip6h->ip6_nxt = IPPROTO_UDP; } else { ip6h->ip6_nxt = IPPROTO_SCTP; } ip6h->ip6_plen = (uint16_t) (packet_length - sizeof(struct ip6_hdr)); ip6h->ip6_dst = sin6->sin6_addr; /* * Add SRC address selection here: we can only reuse * to a limited degree the kame src-addr-sel, since * we can try their selection but it may not be * bound. */ bzero(&lsa6_tmp, sizeof(lsa6_tmp)); lsa6_tmp.sin6_family = AF_INET6; lsa6_tmp.sin6_len = sizeof(lsa6_tmp); lsa6 = &lsa6_tmp; if (net && out_of_asoc_ok == 0) { if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; if (ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } if (net->src_addr_selected == 0) { sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } /* Cache the source address */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, ro, net, 0, vrf_id); (void)sa6_recoverscope(sin6); net->src_addr_selected = 1; } if (net->ro._s_addr == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "V6:No route to host\n"); net->src_addr_selected = 0; sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } lsa6->sin6_addr = net->ro._s_addr->address.sin6.sin6_addr; } else { sin6 = (struct sockaddr_in6 *)&ro->ro_dst; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } if (over_addr == NULL) { struct sctp_ifa *_lsrc; _lsrc = sctp_source_address_selection(inp, stcb, ro, net, out_of_asoc_ok, vrf_id); if (_lsrc == NULL) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } lsa6->sin6_addr = _lsrc->address.sin6.sin6_addr; sctp_free_ifa(_lsrc); } else { lsa6->sin6_addr = over_addr->sin6.sin6_addr; SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } (void)sa6_recoverscope(sin6); } lsa6->sin6_port = inp->sctp_lport; if (ro->ro_rt == NULL) { /* * src addr selection failed to find a route * (or valid source addr), so we can't get * there from here! */ sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } /* * XXX: sa6 may not have a valid sin6_scope_id in * the non-SCOPEDROUTING case. */ bzero(&lsa6_storage, sizeof(lsa6_storage)); lsa6_storage.sin6_family = AF_INET6; lsa6_storage.sin6_len = sizeof(lsa6_storage); lsa6_storage.sin6_addr = lsa6->sin6_addr; if ((error = sa6_recoverscope(&lsa6_storage)) != 0) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "recover scope fails error %d\n", error); sctp_m_freem(m); return (error); } /* XXX */ lsa6_storage.sin6_addr = lsa6->sin6_addr; lsa6_storage.sin6_port = inp->sctp_lport; lsa6 = &lsa6_storage; ip6h->ip6_src = lsa6->sin6_addr; if (port) { if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } udp = (struct udphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr)); udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; udp->uh_ulen = htons((uint16_t) (packet_length - sizeof(struct ip6_hdr))); udp->uh_sum = 0; sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); } else { sctphdr = (struct sctphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr)); } sctphdr->src_port = src_port; sctphdr->dest_port = dest_port; sctphdr->v_tag = v_tag; sctphdr->checksum = 0; /* * We set the hop limit now since there is a good * chance that our ro pointer is now filled */ ip6h->ip6_hlim = SCTP_GET_HLIM(inp, ro); ifp = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); #ifdef SCTP_DEBUG /* Copy to be sure something bad is not happening */ sin6->sin6_addr = ip6h->ip6_dst; lsa6->sin6_addr = ip6h->ip6_src; #endif SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv6 output routine from low level\n"); SCTPDBG(SCTP_DEBUG_OUTPUT3, "src: "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)lsa6); SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst: "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)sin6); if (net) { sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; /* * preserve the port and scope for link * local send */ prev_scope = sin6->sin6_scope_id; prev_port = sin6->sin6_port; } if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { /* failed to prepend data, give up */ sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_ATTACH_CHAIN(o_pak, m, packet_length); if (port) { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); #endif if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), packet_length - sizeof(struct ip6_hdr))) == 0) { udp->uh_sum = 0xffff; } } else { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else m->m_pkthdr.csum_flags = CSUM_SCTP_IPV6; m->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); #endif } /* send it out. table id is taken from stcb */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { so = SCTP_INP_SO(inp); SCTP_SOCKET_UNLOCK(so, 0); } #endif #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) sctp_packet_log(o_pak); #endif SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, stcb, vrf_id); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 0); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); } #endif if (net) { /* for link local this must be done */ sin6->sin6_scope_id = prev_scope; sin6->sin6_port = prev_port; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) { SCTP_STAT_INCR(sctps_senderrors); } if (net == NULL) { /* Now if we had a temp route free it */ RO_RTFREE(ro); } else { /* * PMTU check versus smallest asoc MTU goes * here */ if (ro->ro_rt == NULL) { /* Route was freed */ if (net->ro._s_addr && net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; } net->src_addr_selected = 0; } if ((ro->ro_rt != NULL) && (net->ro._s_addr)) { uint32_t mtu; mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt); if (mtu && (stcb->asoc.smallest_mtu > mtu)) { sctp_mtu_size_reset(inp, &stcb->asoc, mtu); net->mtu = mtu; if (net->port) { net->mtu -= sizeof(struct udphdr); } } } else if (ifp) { if (ND_IFINFO(ifp)->linkmtu && (stcb->asoc.smallest_mtu > ND_IFINFO(ifp)->linkmtu)) { sctp_mtu_size_reset(inp, &stcb->asoc, ND_IFINFO(ifp)->linkmtu); } } } return (ret); } #endif default: SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n", ((struct sockaddr *)to)->sa_family); sctp_m_freem(m); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); return (EFAULT); } } void sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { struct mbuf *m, *m_last; struct sctp_nets *net; struct sctp_init_chunk *init; struct sctp_supported_addr_param *sup_addr; struct sctp_adaptation_layer_indication *ali; struct sctp_supported_chunk_types_param *pr_supported; struct sctp_paramhdr *ph; int cnt_inits_to = 0; int ret; uint16_t num_ext, chunk_len, padding_len, parameter_len; /* INIT's always go to the primary (and usually ONLY address) */ net = stcb->asoc.primary_destination; if (net == NULL) { net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { /* TSNH */ return; } /* we confirm any address we send an INIT to */ net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; (void)sctp_set_primary_addr(stcb, NULL, net); } else { /* we confirm any address we send an INIT to */ net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; } SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT\n"); #ifdef INET6 if (net->ro._l_addr.sa.sa_family == AF_INET6) { /* * special hook, if we are sending to link local it will not * show up in our private address count. */ if (IN6_IS_ADDR_LINKLOCAL(&net->ro._l_addr.sin6.sin6_addr)) cnt_inits_to = 1; } #endif if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { /* This case should not happen */ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - failed timer?\n"); return; } /* start the INIT timer */ sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, net); m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* No memory, INIT timer will re-attempt. */ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - mbuf?\n"); return; } chunk_len = (uint16_t) sizeof(struct sctp_init_chunk); padding_len = 0; /* Now lets put the chunk header in place */ init = mtod(m, struct sctp_init_chunk *); /* now the chunk header */ init->ch.chunk_type = SCTP_INITIATION; init->ch.chunk_flags = 0; /* fill in later from mbuf we build */ init->ch.chunk_length = 0; /* place in my tag */ init->init.initiate_tag = htonl(stcb->asoc.my_vtag); /* set up some of the credits. */ init->init.a_rwnd = htonl(max(inp->sctp_socket ? SCTP_SB_LIMIT_RCV(inp->sctp_socket) : 0, SCTP_MINIMAL_RWND)); init->init.num_outbound_streams = htons(stcb->asoc.pre_open_streams); init->init.num_inbound_streams = htons(stcb->asoc.max_inbound_streams); init->init.initial_tsn = htonl(stcb->asoc.init_seq_number); /* Adaptation layer indication parameter */ if (inp->sctp_ep.adaptation_layer_indicator_provided) { parameter_len = (uint16_t) sizeof(struct sctp_adaptation_layer_indication); ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len); ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); ali->ph.param_length = htons(parameter_len); ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); chunk_len += parameter_len; } /* ECN parameter */ if (stcb->asoc.ecn_supported == 1) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_ECN_CAPABLE); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* PR-SCTP supported parameter */ if (stcb->asoc.prsctp_supported == 1) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_PRSCTP_SUPPORTED); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* Add NAT friendly parameter. */ if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* And now tell the peer which extensions we support */ num_ext = 0; pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); if (stcb->asoc.prsctp_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; if (stcb->asoc.idata_supported) { pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN; } } if (stcb->asoc.auth_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; } if (stcb->asoc.asconf_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; } if (stcb->asoc.reconfig_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; } if (stcb->asoc.idata_supported) { pr_supported->chunk_types[num_ext++] = SCTP_IDATA; } if (stcb->asoc.nrsack_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; } if (stcb->asoc.pktdrop_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; } if (num_ext > 0) { parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext; pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); pr_supported->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add authentication parameters */ if (stcb->asoc.auth_supported) { /* attach RANDOM parameter, if available */ if (stcb->asoc.authinfo.random != NULL) { struct sctp_auth_random *randp; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t) sizeof(struct sctp_auth_random) + stcb->asoc.authinfo.random_len; /* random key already contains the header */ memcpy(randp, stcb->asoc.authinfo.random->key, parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add HMAC_ALGO parameter */ if (stcb->asoc.local_hmacs != NULL) { struct sctp_auth_hmac_algo *hmacs; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t) (sizeof(struct sctp_auth_hmac_algo) + stcb->asoc.local_hmacs->num_algo * sizeof(uint16_t)); hmacs->ph.param_type = htons(SCTP_HMAC_LIST); hmacs->ph.param_length = htons(parameter_len); sctp_serialize_hmaclist(stcb->asoc.local_hmacs, (uint8_t *) hmacs->hmac_ids); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add CHUNKS parameter */ if (stcb->asoc.local_auth_chunks != NULL) { struct sctp_auth_chunk_list *chunks; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t) (sizeof(struct sctp_auth_chunk_list) + sctp_auth_get_chklist_size(stcb->asoc.local_auth_chunks)); chunks->ph.param_type = htons(SCTP_CHUNK_LIST); chunks->ph.param_length = htons(parameter_len); sctp_serialize_auth_chunks(stcb->asoc.local_auth_chunks, chunks->chunk_types); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } } /* now any cookie time extensions */ if (stcb->asoc.cookie_preserve_req) { struct sctp_cookie_perserve_param *cookie_preserve; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } parameter_len = (uint16_t) sizeof(struct sctp_cookie_perserve_param); cookie_preserve = (struct sctp_cookie_perserve_param *)(mtod(m, caddr_t)+chunk_len); cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE); cookie_preserve->ph.param_length = htons(parameter_len); cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req); stcb->asoc.cookie_preserve_req = 0; chunk_len += parameter_len; } if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) { uint8_t i; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); if (stcb->asoc.scope.ipv4_addr_legal) { parameter_len += (uint16_t) sizeof(uint16_t); } if (stcb->asoc.scope.ipv6_addr_legal) { parameter_len += (uint16_t) sizeof(uint16_t); } sup_addr = (struct sctp_supported_addr_param *)(mtod(m, caddr_t)+chunk_len); sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE); sup_addr->ph.param_length = htons(parameter_len); i = 0; if (stcb->asoc.scope.ipv4_addr_legal) { sup_addr->addr_type[i++] = htons(SCTP_IPV4_ADDRESS); } if (stcb->asoc.scope.ipv6_addr_legal) { sup_addr->addr_type[i++] = htons(SCTP_IPV6_ADDRESS); } padding_len = 4 - 2 * i; chunk_len += parameter_len; } SCTP_BUF_LEN(m) = chunk_len; /* now the addresses */ /* * To optimize this we could put the scoping stuff into a structure * and remove the individual uint8's from the assoc structure. Then * we could just sifa in the address within the stcb. But for now * this is a quick hack to get the address stuff teased apart. */ m_last = sctp_add_addresses_to_i_ia(inp, stcb, &stcb->asoc.scope, m, cnt_inits_to, &padding_len, &chunk_len); init->ch.chunk_length = htons(chunk_len); if (padding_len > 0) { if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(m); return; } } SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - calls lowlevel_output\n"); ret = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, m, 0, NULL, 0, 0, 0, 0, inp->sctp_lport, stcb->rport, htonl(0), net->port, NULL, 0, 0, so_locked); SCTPDBG(SCTP_DEBUG_OUTPUT4, "lowlevel_output - %d\n", ret); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); } struct mbuf * sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, int param_offset, int *abort_processing, struct sctp_chunkhdr *cp, int *nat_friendly) { /* * Given a mbuf containing an INIT or INIT-ACK with the param_offset * being equal to the beginning of the params i.e. (iphlen + * sizeof(struct sctp_init_msg) parse through the parameters to the * end of the mbuf verifying that all parameters are known. * * For unknown parameters build and return a mbuf with * UNRECOGNIZED_PARAMETER errors. If the flags indicate to stop * processing this chunk stop, and set *abort_processing to 1. * * By having param_offset be pre-set to where parameters begin it is * hoped that this routine may be reused in the future by new * features. */ struct sctp_paramhdr *phdr, params; struct mbuf *mat, *op_err; char tempbuf[SCTP_PARAM_BUFFER_SIZE]; int at, limit, pad_needed; uint16_t ptype, plen, padded_size; int err_at; *abort_processing = 0; mat = in_initpkt; err_at = 0; limit = ntohs(cp->chunk_length) - sizeof(struct sctp_init_chunk); at = param_offset; op_err = NULL; SCTPDBG(SCTP_DEBUG_OUTPUT1, "Check for unrecognized param's\n"); phdr = sctp_get_next_param(mat, at, ¶ms, sizeof(params)); while ((phdr != NULL) && ((size_t)limit >= sizeof(struct sctp_paramhdr))) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if ((plen > limit) || (plen < sizeof(struct sctp_paramhdr))) { /* wacked parameter */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error %d\n", plen); goto invalid_size; } limit -= SCTP_SIZE32(plen); /*- * All parameters for all chunks that we know/understand are * listed here. We process them other places and make * appropriate stop actions per the upper bits. However this * is the generic routine processor's can call to get back * an operr.. to either incorporate (init-ack) or send. */ padded_size = SCTP_SIZE32(plen); switch (ptype) { /* Param's with variable size */ case SCTP_HEARTBEAT_INFO: case SCTP_STATE_COOKIE: case SCTP_UNRECOG_PARAM: case SCTP_ERROR_CAUSE_IND: /* ok skip fwd */ at += padded_size; break; /* Param's with variable size within a range */ case SCTP_CHUNK_LIST: case SCTP_SUPPORTED_CHUNK_EXT: if (padded_size > (sizeof(struct sctp_supported_chunk_types_param) + (sizeof(uint8_t) * SCTP_MAX_SUPPORTED_EXT))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error chklist %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_SUPPORTED_ADDRTYPE: if (padded_size > SCTP_MAX_ADDR_PARAMS_SIZE) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error supaddrtype %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_RANDOM: if (padded_size > (sizeof(struct sctp_auth_random) + SCTP_RANDOM_MAX_SIZE)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error random %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_SET_PRIM_ADDR: case SCTP_DEL_IP_ADDRESS: case SCTP_ADD_IP_ADDRESS: if ((padded_size != sizeof(struct sctp_asconf_addrv4_param)) && (padded_size != sizeof(struct sctp_asconf_addr_param))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error setprim %d\n", plen); goto invalid_size; } at += padded_size; break; /* Param's with a fixed size */ case SCTP_IPV4_ADDRESS: if (padded_size != sizeof(struct sctp_ipv4addr_param)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv4 addr %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_IPV6_ADDRESS: if (padded_size != sizeof(struct sctp_ipv6addr_param)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv6 addr %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_COOKIE_PRESERVE: if (padded_size != sizeof(struct sctp_cookie_perserve_param)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error cookie-preserve %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_HAS_NAT_SUPPORT: *nat_friendly = 1; /* fall through */ case SCTP_PRSCTP_SUPPORTED: if (padded_size != sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error prsctp/nat support %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_ECN_CAPABLE: if (padded_size != sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecn %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_ULP_ADAPTATION: if (padded_size != sizeof(struct sctp_adaptation_layer_indication)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error adapatation %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_SUCCESS_REPORT: if (padded_size != sizeof(struct sctp_asconf_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error success %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_HOSTNAME_ADDRESS: { /* We can NOT handle HOST NAME addresses!! */ int l_len; SCTPDBG(SCTP_DEBUG_OUTPUT1, "Can't handle hostname addresses.. abort processing\n"); *abort_processing = 1; if (op_err == NULL) { /* Ok need to try to get a mbuf */ #ifdef INET6 l_len = SCTP_MIN_OVERHEAD; #else l_len = SCTP_MIN_V4_OVERHEAD; #endif l_len += sizeof(struct sctp_chunkhdr); l_len += plen; l_len += sizeof(struct sctp_paramhdr); op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; /* * pre-reserve space for ip * and sctp header and * chunk hdr */ #ifdef INET6 SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); #else SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); } } if (op_err) { /* If we have space */ struct sctp_paramhdr s; if (err_at % 4) { uint32_t cpthis = 0; pad_needed = 4 - (err_at % 4); m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); err_at += pad_needed; } s.param_type = htons(SCTP_CAUSE_UNRESOLVABLE_ADDR); s.param_length = htons(sizeof(s) + plen); m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s); err_at += sizeof(s); phdr = sctp_get_next_param(mat, at, (struct sctp_paramhdr *)tempbuf, min(sizeof(tempbuf), plen)); if (phdr == NULL) { sctp_m_freem(op_err); /* * we are out of memory but * we still need to have a * look at what to do (the * system is in trouble * though). */ return (NULL); } m_copyback(op_err, err_at, plen, (caddr_t)phdr); } return (op_err); break; } default: /* * we do not recognize the parameter figure out what * we do. */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Hit default param %x\n", ptype); if ((ptype & 0x4000) == 0x4000) { /* Report bit is set?? */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "report op err\n"); if (op_err == NULL) { int l_len; /* Ok need to try to get an mbuf */ #ifdef INET6 l_len = SCTP_MIN_OVERHEAD; #else l_len = SCTP_MIN_V4_OVERHEAD; #endif l_len += sizeof(struct sctp_chunkhdr); l_len += plen; l_len += sizeof(struct sctp_paramhdr); op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; #ifdef INET6 SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); #else SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); } } if (op_err) { /* If we have space */ struct sctp_paramhdr s; if (err_at % 4) { uint32_t cpthis = 0; pad_needed = 4 - (err_at % 4); m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); err_at += pad_needed; } s.param_type = htons(SCTP_UNRECOG_PARAM); s.param_length = htons(sizeof(s) + plen); m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s); err_at += sizeof(s); if (plen > sizeof(tempbuf)) { plen = sizeof(tempbuf); } phdr = sctp_get_next_param(mat, at, (struct sctp_paramhdr *)tempbuf, min(sizeof(tempbuf), plen)); if (phdr == NULL) { sctp_m_freem(op_err); /* * we are out of memory but * we still need to have a * look at what to do (the * system is in trouble * though). */ op_err = NULL; goto more_processing; } m_copyback(op_err, err_at, plen, (caddr_t)phdr); err_at += plen; } } more_processing: if ((ptype & 0x8000) == 0x0000) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "stop proc\n"); return (op_err); } else { /* skip this chunk and continue processing */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "move on\n"); at += SCTP_SIZE32(plen); } break; } phdr = sctp_get_next_param(mat, at, ¶ms, sizeof(params)); } return (op_err); invalid_size: SCTPDBG(SCTP_DEBUG_OUTPUT1, "abort flag set\n"); *abort_processing = 1; if ((op_err == NULL) && phdr) { int l_len; #ifdef INET6 l_len = SCTP_MIN_OVERHEAD; #else l_len = SCTP_MIN_V4_OVERHEAD; #endif l_len += sizeof(struct sctp_chunkhdr); l_len += (2 * sizeof(struct sctp_paramhdr)); op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; #ifdef INET6 SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); #else SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); } } if ((op_err) && phdr) { struct sctp_paramhdr s; if (err_at % 4) { uint32_t cpthis = 0; pad_needed = 4 - (err_at % 4); m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); err_at += pad_needed; } s.param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); s.param_length = htons(sizeof(s) + sizeof(struct sctp_paramhdr)); m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s); err_at += sizeof(s); /* Only copy back the p-hdr that caused the issue */ m_copyback(op_err, err_at, sizeof(struct sctp_paramhdr), (caddr_t)phdr); } return (op_err); } static int sctp_are_there_new_addresses(struct sctp_association *asoc, struct mbuf *in_initpkt, int offset, struct sockaddr *src) { /* * Given a INIT packet, look through the packet to verify that there * are NO new addresses. As we go through the parameters add reports * of any un-understood parameters that require an error. Also we * must return (1) to drop the packet if we see a un-understood * parameter that tells us to drop the chunk. */ struct sockaddr *sa_touse; struct sockaddr *sa; struct sctp_paramhdr *phdr, params; uint16_t ptype, plen; uint8_t fnd; struct sctp_nets *net; int check_src; #ifdef INET struct sockaddr_in sin4, *sa4; #endif #ifdef INET6 struct sockaddr_in6 sin6, *sa6; #endif #ifdef INET memset(&sin4, 0, sizeof(sin4)); sin4.sin_family = AF_INET; sin4.sin_len = sizeof(sin4); #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); #endif /* First what about the src address of the pkt ? */ check_src = 0; switch (src->sa_family) { #ifdef INET case AF_INET: if (asoc->scope.ipv4_addr_legal) { check_src = 1; } break; #endif #ifdef INET6 case AF_INET6: if (asoc->scope.ipv6_addr_legal) { check_src = 1; } break; #endif default: /* TSNH */ break; } if (check_src) { fnd = 0; TAILQ_FOREACH(net, &asoc->nets, sctp_next) { sa = (struct sockaddr *)&net->ro._l_addr; if (sa->sa_family == src->sa_family) { #ifdef INET if (sa->sa_family == AF_INET) { struct sockaddr_in *src4; sa4 = (struct sockaddr_in *)sa; src4 = (struct sockaddr_in *)src; if (sa4->sin_addr.s_addr == src4->sin_addr.s_addr) { fnd = 1; break; } } #endif #ifdef INET6 if (sa->sa_family == AF_INET6) { struct sockaddr_in6 *src6; sa6 = (struct sockaddr_in6 *)sa; src6 = (struct sockaddr_in6 *)src; if (SCTP6_ARE_ADDR_EQUAL(sa6, src6)) { fnd = 1; break; } } #endif } } if (fnd == 0) { /* New address added! no need to look further. */ return (1); } } /* Ok so far lets munge through the rest of the packet */ offset += sizeof(struct sctp_init_chunk); phdr = sctp_get_next_param(in_initpkt, offset, ¶ms, sizeof(params)); while (phdr) { sa_touse = NULL; ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); switch (ptype) { #ifdef INET case SCTP_IPV4_ADDRESS: { struct sctp_ipv4addr_param *p4, p4_buf; phdr = sctp_get_next_param(in_initpkt, offset, (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf)); if (plen != sizeof(struct sctp_ipv4addr_param) || phdr == NULL) { return (1); } if (asoc->scope.ipv4_addr_legal) { p4 = (struct sctp_ipv4addr_param *)phdr; sin4.sin_addr.s_addr = p4->addr; sa_touse = (struct sockaddr *)&sin4; } break; } #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: { struct sctp_ipv6addr_param *p6, p6_buf; phdr = sctp_get_next_param(in_initpkt, offset, (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf)); if (plen != sizeof(struct sctp_ipv6addr_param) || phdr == NULL) { return (1); } if (asoc->scope.ipv6_addr_legal) { p6 = (struct sctp_ipv6addr_param *)phdr; memcpy((caddr_t)&sin6.sin6_addr, p6->addr, sizeof(p6->addr)); sa_touse = (struct sockaddr *)&sin6; } break; } #endif default: sa_touse = NULL; break; } if (sa_touse) { /* ok, sa_touse points to one to check */ fnd = 0; TAILQ_FOREACH(net, &asoc->nets, sctp_next) { sa = (struct sockaddr *)&net->ro._l_addr; if (sa->sa_family != sa_touse->sa_family) { continue; } #ifdef INET if (sa->sa_family == AF_INET) { sa4 = (struct sockaddr_in *)sa; if (sa4->sin_addr.s_addr == sin4.sin_addr.s_addr) { fnd = 1; break; } } #endif #ifdef INET6 if (sa->sa_family == AF_INET6) { sa6 = (struct sockaddr_in6 *)sa; if (SCTP6_ARE_ADDR_EQUAL( sa6, &sin6)) { fnd = 1; break; } } #endif } if (!fnd) { /* New addr added! no need to look further */ return (1); } } offset += SCTP_SIZE32(plen); phdr = sctp_get_next_param(in_initpkt, offset, ¶ms, sizeof(params)); } return (0); } /* * Given a MBUF chain that was sent into us containing an INIT. Build a * INIT-ACK with COOKIE and send back. We assume that the in_initpkt has done * a pullup to include IPv6/4header, SCTP header and initial part of INIT * message (i.e. the struct sctp_init_msg). */ void sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *src_net, struct mbuf *init_pkt, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_chunk *init_chk, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port, int hold_inp_lock) { struct sctp_association *asoc; struct mbuf *m, *m_tmp, *m_last, *m_cookie, *op_err; struct sctp_init_ack_chunk *initack; struct sctp_adaptation_layer_indication *ali; struct sctp_supported_chunk_types_param *pr_supported; struct sctp_paramhdr *ph; union sctp_sockstore *over_addr; struct sctp_scoping scp; #ifdef INET struct sockaddr_in *dst4 = (struct sockaddr_in *)dst; struct sockaddr_in *src4 = (struct sockaddr_in *)src; struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; struct sockaddr_in6 *src6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *sin6; #endif struct sockaddr *to; struct sctp_state_cookie stc; struct sctp_nets *net = NULL; uint8_t *signature = NULL; int cnt_inits_to = 0; uint16_t his_limit, i_want; int abort_flag; int nat_friendly = 0; struct socket *so; uint16_t num_ext, chunk_len, padding_len, parameter_len; if (stcb) { asoc = &stcb->asoc; } else { asoc = NULL; } if ((asoc != NULL) && (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT)) { if (sctp_are_there_new_addresses(asoc, init_pkt, offset, src)) { /* * new addresses, out of here in non-cookie-wait * states * * Send an ABORT, without the new address error cause. * This looks no different than if no listener was * present. */ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Address added"); sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return; } if (src_net != NULL && (src_net->port != port)) { /* * change of remote encapsulation port, out of here * in non-cookie-wait states * * Send an ABORT, without an specific error cause. This * looks no different than if no listener was * present. */ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Remote encapsulation port changed"); sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return; } } abort_flag = 0; op_err = sctp_arethere_unrecognized_parameters(init_pkt, (offset + sizeof(struct sctp_init_chunk)), &abort_flag, (struct sctp_chunkhdr *)init_chk, &nat_friendly); if (abort_flag) { do_a_abort: if (op_err == NULL) { char msg[SCTP_DIAG_INFO_LEN]; snprintf(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); } sctp_send_abort(init_pkt, iphlen, src, dst, sh, init_chk->init.initiate_tag, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return; } m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* No memory, INIT timer will re-attempt. */ if (op_err) sctp_m_freem(op_err); return; } chunk_len = (uint16_t) sizeof(struct sctp_init_ack_chunk); padding_len = 0; /* * We might not overwrite the identification[] completely and on * some platforms time_entered will contain some padding. Therefore * zero out the cookie to avoid putting uninitialized memory on the * wire. */ memset(&stc, 0, sizeof(struct sctp_state_cookie)); /* the time I built cookie */ (void)SCTP_GETTIME_TIMEVAL(&stc.time_entered); /* populate any tie tags */ if (asoc != NULL) { /* unlock before tag selections */ stc.tie_tag_my_vtag = asoc->my_vtag_nonce; stc.tie_tag_peer_vtag = asoc->peer_vtag_nonce; stc.cookie_life = asoc->cookie_life; net = asoc->primary_destination; } else { stc.tie_tag_my_vtag = 0; stc.tie_tag_peer_vtag = 0; /* life I will award this cookie */ stc.cookie_life = inp->sctp_ep.def_cookie_life; } /* copy in the ports for later check */ stc.myport = sh->dest_port; stc.peerport = sh->src_port; /* * If we wanted to honor cookie life extensions, we would add to * stc.cookie_life. For now we should NOT honor any extension */ stc.site_scope = stc.local_scope = stc.loopback_scope = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { stc.ipv6_addr_legal = 1; if (SCTP_IPV6_V6ONLY(inp)) { stc.ipv4_addr_legal = 0; } else { stc.ipv4_addr_legal = 1; } } else { stc.ipv6_addr_legal = 0; stc.ipv4_addr_legal = 1; } stc.ipv4_scope = 0; if (net == NULL) { to = src; switch (dst->sa_family) { #ifdef INET case AF_INET: { /* lookup address */ stc.address[0] = src4->sin_addr.s_addr; stc.address[1] = 0; stc.address[2] = 0; stc.address[3] = 0; stc.addr_type = SCTP_IPV4_ADDRESS; /* local from address */ stc.laddress[0] = dst4->sin_addr.s_addr; stc.laddress[1] = 0; stc.laddress[2] = 0; stc.laddress[3] = 0; stc.laddr_type = SCTP_IPV4_ADDRESS; /* scope_id is only for v6 */ stc.scope_id = 0; if ((IN4_ISPRIVATE_ADDRESS(&src4->sin_addr)) || (IN4_ISPRIVATE_ADDRESS(&dst4->sin_addr))) { stc.ipv4_scope = 1; } /* Must use the address in this case */ if (sctp_is_address_on_local_host(src, vrf_id)) { stc.loopback_scope = 1; stc.ipv4_scope = 1; stc.site_scope = 1; stc.local_scope = 0; } break; } #endif #ifdef INET6 case AF_INET6: { stc.addr_type = SCTP_IPV6_ADDRESS; memcpy(&stc.address, &src6->sin6_addr, sizeof(struct in6_addr)); stc.scope_id = ntohs(in6_getscope(&src6->sin6_addr)); if (sctp_is_address_on_local_host(src, vrf_id)) { stc.loopback_scope = 1; stc.local_scope = 0; stc.site_scope = 1; stc.ipv4_scope = 1; } else if (IN6_IS_ADDR_LINKLOCAL(&src6->sin6_addr) || IN6_IS_ADDR_LINKLOCAL(&dst6->sin6_addr)) { /* * If the new destination or source * is a LINK_LOCAL we must have * common both site and local scope. * Don't set local scope though * since we must depend on the * source to be added implicitly. We * cannot assure just because we * share one link that all links are * common. */ stc.local_scope = 0; stc.site_scope = 1; stc.ipv4_scope = 1; /* * we start counting for the private * address stuff at 1. since the * link local we source from won't * show up in our scoped count. */ cnt_inits_to = 1; /* * pull out the scope_id from * incoming pkt */ } else if (IN6_IS_ADDR_SITELOCAL(&src6->sin6_addr) || IN6_IS_ADDR_SITELOCAL(&dst6->sin6_addr)) { /* * If the new destination or source * is SITE_LOCAL then we must have * site scope in common. */ stc.site_scope = 1; } memcpy(&stc.laddress, &dst6->sin6_addr, sizeof(struct in6_addr)); stc.laddr_type = SCTP_IPV6_ADDRESS; break; } #endif default: /* TSNH */ goto do_a_abort; break; } } else { /* set the scope per the existing tcb */ #ifdef INET6 struct sctp_nets *lnet; #endif stc.loopback_scope = asoc->scope.loopback_scope; stc.ipv4_scope = asoc->scope.ipv4_local_scope; stc.site_scope = asoc->scope.site_scope; stc.local_scope = asoc->scope.local_scope; #ifdef INET6 /* Why do we not consider IPv4 LL addresses? */ TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) { if (lnet->ro._l_addr.sin6.sin6_family == AF_INET6) { if (IN6_IS_ADDR_LINKLOCAL(&lnet->ro._l_addr.sin6.sin6_addr)) { /* * if we have a LL address, start * counting at 1. */ cnt_inits_to = 1; } } } #endif /* use the net pointer */ to = (struct sockaddr *)&net->ro._l_addr; switch (to->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)to; stc.address[0] = sin->sin_addr.s_addr; stc.address[1] = 0; stc.address[2] = 0; stc.address[3] = 0; stc.addr_type = SCTP_IPV4_ADDRESS; if (net->src_addr_selected == 0) { /* * strange case here, the INIT should have * did the selection. */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, (sctp_route_t *) & net->ro, net, 0, vrf_id); if (net->ro._s_addr == NULL) return; net->src_addr_selected = 1; } stc.laddress[0] = net->ro._s_addr->address.sin.sin_addr.s_addr; stc.laddress[1] = 0; stc.laddress[2] = 0; stc.laddress[3] = 0; stc.laddr_type = SCTP_IPV4_ADDRESS; /* scope_id is only for v6 */ stc.scope_id = 0; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)to; memcpy(&stc.address, &sin6->sin6_addr, sizeof(struct in6_addr)); stc.addr_type = SCTP_IPV6_ADDRESS; stc.scope_id = sin6->sin6_scope_id; if (net->src_addr_selected == 0) { /* * strange case here, the INIT should have * done the selection. */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, (sctp_route_t *) & net->ro, net, 0, vrf_id); if (net->ro._s_addr == NULL) return; net->src_addr_selected = 1; } memcpy(&stc.laddress, &net->ro._s_addr->address.sin6.sin6_addr, sizeof(struct in6_addr)); stc.laddr_type = SCTP_IPV6_ADDRESS; break; #endif } } /* Now lets put the SCTP header in place */ initack = mtod(m, struct sctp_init_ack_chunk *); /* Save it off for quick ref */ stc.peers_vtag = ntohl(init_chk->init.initiate_tag); /* who are we */ memcpy(stc.identification, SCTP_VERSION_STRING, min(strlen(SCTP_VERSION_STRING), sizeof(stc.identification))); memset(stc.reserved, 0, SCTP_RESERVE_SPACE); /* now the chunk header */ initack->ch.chunk_type = SCTP_INITIATION_ACK; initack->ch.chunk_flags = 0; /* fill in later from mbuf we build */ initack->ch.chunk_length = 0; /* place in my tag */ if ((asoc != NULL) && ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_INUSE) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED))) { /* re-use the v-tags and init-seq here */ initack->init.initiate_tag = htonl(asoc->my_vtag); initack->init.initial_tsn = htonl(asoc->init_seq_number); } else { uint32_t vtag, itsn; if (hold_inp_lock) { SCTP_INP_INCR_REF(inp); SCTP_INP_RUNLOCK(inp); } if (asoc) { atomic_add_int(&asoc->refcnt, 1); SCTP_TCB_UNLOCK(stcb); new_tag: vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1); if ((asoc->peer_supports_nat) && (vtag == asoc->my_vtag)) { /* * Got a duplicate vtag on some guy behind a * nat make sure we don't use it. */ goto new_tag; } initack->init.initiate_tag = htonl(vtag); /* get a TSN to use too */ itsn = sctp_select_initial_TSN(&inp->sctp_ep); initack->init.initial_tsn = htonl(itsn); SCTP_TCB_LOCK(stcb); atomic_add_int(&asoc->refcnt, -1); } else { vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1); initack->init.initiate_tag = htonl(vtag); /* get a TSN to use too */ initack->init.initial_tsn = htonl(sctp_select_initial_TSN(&inp->sctp_ep)); } if (hold_inp_lock) { SCTP_INP_RLOCK(inp); SCTP_INP_DECR_REF(inp); } } /* save away my tag to */ stc.my_vtag = initack->init.initiate_tag; /* set up some of the credits. */ so = inp->sctp_socket; if (so == NULL) { /* memory problem */ sctp_m_freem(m); return; } else { initack->init.a_rwnd = htonl(max(SCTP_SB_LIMIT_RCV(so), SCTP_MINIMAL_RWND)); } /* set what I want */ his_limit = ntohs(init_chk->init.num_inbound_streams); /* choose what I want */ if (asoc != NULL) { if (asoc->streamoutcnt > asoc->pre_open_streams) { i_want = asoc->streamoutcnt; } else { i_want = asoc->pre_open_streams; } } else { i_want = inp->sctp_ep.pre_open_stream_count; } if (his_limit < i_want) { /* I Want more :< */ initack->init.num_outbound_streams = init_chk->init.num_inbound_streams; } else { /* I can have what I want :> */ initack->init.num_outbound_streams = htons(i_want); } /* tell him his limit. */ initack->init.num_inbound_streams = htons(inp->sctp_ep.max_open_streams_intome); /* adaptation layer indication parameter */ if (inp->sctp_ep.adaptation_layer_indicator_provided) { parameter_len = (uint16_t) sizeof(struct sctp_adaptation_layer_indication); ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len); ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); ali->ph.param_length = htons(parameter_len); ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); chunk_len += parameter_len; } /* ECN parameter */ if (((asoc != NULL) && (asoc->ecn_supported == 1)) || ((asoc == NULL) && (inp->ecn_supported == 1))) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_ECN_CAPABLE); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* PR-SCTP supported parameter */ if (((asoc != NULL) && (asoc->prsctp_supported == 1)) || ((asoc == NULL) && (inp->prsctp_supported == 1))) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_PRSCTP_SUPPORTED); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* Add NAT friendly parameter */ if (nat_friendly) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* And now tell the peer which extensions we support */ num_ext = 0; pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); if (((asoc != NULL) && (asoc->prsctp_supported == 1)) || ((asoc == NULL) && (inp->prsctp_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; if (((asoc != NULL) && (asoc->idata_supported == 1)) || ((asoc == NULL) && (inp->idata_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN; } } if (((asoc != NULL) && (asoc->auth_supported == 1)) || ((asoc == NULL) && (inp->auth_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; } if (((asoc != NULL) && (asoc->asconf_supported == 1)) || ((asoc == NULL) && (inp->asconf_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; } if (((asoc != NULL) && (asoc->reconfig_supported == 1)) || ((asoc == NULL) && (inp->reconfig_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; } if (((asoc != NULL) && (asoc->idata_supported == 1)) || ((asoc == NULL) && (inp->idata_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_IDATA; } if (((asoc != NULL) && (asoc->nrsack_supported == 1)) || ((asoc == NULL) && (inp->nrsack_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; } if (((asoc != NULL) && (asoc->pktdrop_supported == 1)) || ((asoc == NULL) && (inp->pktdrop_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; } if (num_ext > 0) { parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext; pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); pr_supported->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add authentication parameters */ if (((asoc != NULL) && (asoc->auth_supported == 1)) || ((asoc == NULL) && (inp->auth_supported == 1))) { struct sctp_auth_random *randp; struct sctp_auth_hmac_algo *hmacs; struct sctp_auth_chunk_list *chunks; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } /* generate and add RANDOM parameter */ randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t) sizeof(struct sctp_auth_random) + SCTP_AUTH_RANDOM_SIZE_DEFAULT; randp->ph.param_type = htons(SCTP_RANDOM); randp->ph.param_length = htons(parameter_len); SCTP_READ_RANDOM(randp->random_data, SCTP_AUTH_RANDOM_SIZE_DEFAULT); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } /* add HMAC_ALGO parameter */ hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t) sizeof(struct sctp_auth_hmac_algo) + sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs, (uint8_t *) hmacs->hmac_ids); hmacs->ph.param_type = htons(SCTP_HMAC_LIST); hmacs->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } /* add CHUNKS parameter */ chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t) sizeof(struct sctp_auth_chunk_list) + sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks, chunks->chunk_types); chunks->ph.param_type = htons(SCTP_CHUNK_LIST); chunks->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } SCTP_BUF_LEN(m) = chunk_len; m_last = m; /* now the addresses */ /* * To optimize this we could put the scoping stuff into a structure * and remove the individual uint8's from the stc structure. Then we * could just sifa in the address within the stc.. but for now this * is a quick hack to get the address stuff teased apart. */ scp.ipv4_addr_legal = stc.ipv4_addr_legal; scp.ipv6_addr_legal = stc.ipv6_addr_legal; scp.loopback_scope = stc.loopback_scope; scp.ipv4_local_scope = stc.ipv4_scope; scp.local_scope = stc.local_scope; scp.site_scope = stc.site_scope; m_last = sctp_add_addresses_to_i_ia(inp, stcb, &scp, m_last, cnt_inits_to, &padding_len, &chunk_len); /* padding_len can only be positive, if no addresses have been added */ if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; SCTP_BUF_LEN(m) += padding_len; padding_len = 0; } /* tack on the operational error if present */ if (op_err) { parameter_len = 0; for (m_tmp = op_err; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) { parameter_len += SCTP_BUF_LEN(m_tmp); } padding_len = SCTP_SIZE32(parameter_len) - parameter_len; SCTP_BUF_NEXT(m_last) = op_err; while (SCTP_BUF_NEXT(m_last) != NULL) { m_last = SCTP_BUF_NEXT(m_last); } chunk_len += parameter_len; } if (padding_len > 0) { m_last = sctp_add_pad_tombuf(m_last, padding_len); if (m_last == NULL) { /* Houston we have a problem, no space */ sctp_m_freem(m); return; } chunk_len += padding_len; padding_len = 0; } /* Now we must build a cookie */ m_cookie = sctp_add_cookie(init_pkt, offset, m, 0, &stc, &signature); if (m_cookie == NULL) { /* memory problem */ sctp_m_freem(m); return; } /* Now append the cookie to the end and update the space/size */ SCTP_BUF_NEXT(m_last) = m_cookie; parameter_len = 0; for (m_tmp = m_cookie; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) { parameter_len += SCTP_BUF_LEN(m_tmp); if (SCTP_BUF_NEXT(m_tmp) == NULL) { m_last = m_tmp; } } padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; /* * Place in the size, but we don't include the last pad (if any) in * the INIT-ACK. */ initack->ch.chunk_length = htons(chunk_len); /* * Time to sign the cookie, we don't sign over the cookie signature * though thus we set trailer. */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *) inp->sctp_ep.secret_key[(int)(inp->sctp_ep.current_secret_number)], SCTP_SECRET_SIZE, m_cookie, sizeof(struct sctp_paramhdr), (uint8_t *) signature, SCTP_SIGNATURE_SIZE); /* * We sifa 0 here to NOT set IP_DF if its IPv4, we ignore the return * here since the timer will drive a retranmission. */ if (padding_len > 0) { if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(m); return; } } if (stc.loopback_scope) { over_addr = (union sctp_sockstore *)dst; } else { over_addr = NULL; } (void)sctp_lowlevel_chunk_output(inp, NULL, NULL, to, m, 0, NULL, 0, 0, 0, 0, inp->sctp_lport, sh->src_port, init_chk->init.initiate_tag, port, over_addr, mflowtype, mflowid, SCTP_SO_NOT_LOCKED); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } static void sctp_prune_prsctp(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_sndrcvinfo *srcv, int dataout) { int freed_spc = 0; struct sctp_tmit_chunk *chk, *nchk; SCTP_TCB_LOCK_ASSERT(stcb); if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) { TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { /* * Look for chunks marked with the PR_SCTP flag AND * the buffer space flag. If the one being sent is * equal or greater priority then purge the old one * and free some space. */ if (PR_SCTP_BUF_ENABLED(chk->flags)) { /* * This one is PR-SCTP AND buffer space * limited type */ if (chk->rec.data.timetodrop.tv_sec >= (long)srcv->sinfo_timetolive) { /* * Lower numbers equates to higher * priority so if the one we are * looking at has a larger or equal * priority we want to drop the data * and NOT retransmit it. */ if (chk->data) { /* * We release the book_size * if the mbuf is here */ int ret_spc; uint8_t sent; if (chk->sent > SCTP_DATAGRAM_UNSENT) sent = 1; else sent = 0; ret_spc = sctp_release_pr_sctp_chunk(stcb, chk, sent, SCTP_SO_LOCKED); freed_spc += ret_spc; if (freed_spc >= dataout) { return; } } /* if chunk was present */ } /* if of sufficient priority */ } /* if chunk has enabled */ } /* tailqforeach */ TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { /* Here we must move to the sent queue and mark */ if (PR_SCTP_BUF_ENABLED(chk->flags)) { if (chk->rec.data.timetodrop.tv_sec >= (long)srcv->sinfo_timetolive) { if (chk->data) { /* * We release the book_size * if the mbuf is here */ int ret_spc; ret_spc = sctp_release_pr_sctp_chunk(stcb, chk, 0, SCTP_SO_LOCKED); freed_spc += ret_spc; if (freed_spc >= dataout) { return; } } /* end if chk->data */ } /* end if right class */ } /* end if chk pr-sctp */ } /* tailqforeachsafe (chk) */ } /* if enabled in asoc */ } int sctp_get_frag_point(struct sctp_tcb *stcb, struct sctp_association *asoc) { int siz, ovh; /* * For endpoints that have both v6 and v4 addresses we must reserve * room for the ipv6 header, for those that are only dealing with V4 * we use a larger frag point. */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MIN_OVERHEAD; } else { ovh = SCTP_MIN_V4_OVERHEAD; } if (stcb->asoc.idata_supported) { ovh += sizeof(struct sctp_idata_chunk); } else { ovh += sizeof(struct sctp_data_chunk); } if (stcb->asoc.sctp_frag_point > asoc->smallest_mtu) siz = asoc->smallest_mtu - ovh; else siz = (stcb->asoc.sctp_frag_point - ovh); /* * if (siz > (MCLBYTES-sizeof(struct sctp_data_chunk))) { */ /* A data chunk MUST fit in a cluster */ /* siz = (MCLBYTES - sizeof(struct sctp_data_chunk)); */ /* } */ /* adjust for an AUTH chunk if DATA requires auth */ if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) siz -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); if (siz % 4) { /* make it an even word boundary please */ siz -= (siz % 4); } return (siz); } static void sctp_set_prsctp_policy(struct sctp_stream_queue_pending *sp) { /* * We assume that the user wants PR_SCTP_TTL if the user provides a * positive lifetime but does not specify any PR_SCTP policy. */ if (PR_SCTP_ENABLED(sp->sinfo_flags)) { sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags); } else if (sp->timetolive > 0) { sp->sinfo_flags |= SCTP_PR_SCTP_TTL; sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags); } else { return; } switch (PR_SCTP_POLICY(sp->sinfo_flags)) { case CHUNK_FLAGS_PR_SCTP_BUF: /* * Time to live is a priority stored in tv_sec when doing * the buffer drop thing. */ sp->ts.tv_sec = sp->timetolive; sp->ts.tv_usec = 0; break; case CHUNK_FLAGS_PR_SCTP_TTL: { struct timeval tv; (void)SCTP_GETTIME_TIMEVAL(&sp->ts); tv.tv_sec = sp->timetolive / 1000; tv.tv_usec = (sp->timetolive * 1000) % 1000000; /* * TODO sctp_constants.h needs alternative time * macros when _KERNEL is undefined. */ timevaladd(&sp->ts, &tv); } break; case CHUNK_FLAGS_PR_SCTP_RTX: /* * Time to live is a the number or retransmissions stored in * tv_sec. */ sp->ts.tv_sec = sp->timetolive; sp->ts.tv_usec = 0; break; default: SCTPDBG(SCTP_DEBUG_USRREQ1, "Unknown PR_SCTP policy %u.\n", PR_SCTP_POLICY(sp->sinfo_flags)); break; } } static int sctp_msg_append(struct sctp_tcb *stcb, struct sctp_nets *net, struct mbuf *m, struct sctp_sndrcvinfo *srcv, int hold_stcb_lock) { int error = 0; struct mbuf *at; struct sctp_stream_queue_pending *sp = NULL; struct sctp_stream_out *strm; /* * Given an mbuf chain, put it into the association send queue and * place it on the wheel */ if (srcv->sinfo_stream >= stcb->asoc.streamoutcnt) { /* Invalid stream number */ SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_now; } if ((stcb->asoc.stream_locked) && (stcb->asoc.stream_locked_on != srcv->sinfo_stream)) { SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_now; } strm = &stcb->asoc.strmout[srcv->sinfo_stream]; /* Now can we send this? */ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_SENT) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || (stcb->asoc.state & SCTP_STATE_SHUTDOWN_PENDING)) { /* got data while shutting down */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; goto out_now; } sctp_alloc_a_strmoq(stcb, sp); if (sp == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); error = ENOMEM; goto out_now; } sp->sinfo_flags = srcv->sinfo_flags; sp->timetolive = srcv->sinfo_timetolive; sp->ppid = srcv->sinfo_ppid; sp->context = srcv->sinfo_context; sp->fsn = 0; if (sp->sinfo_flags & SCTP_ADDR_OVER) { sp->net = net; atomic_add_int(&sp->net->ref_count, 1); } else { sp->net = NULL; } (void)SCTP_GETTIME_TIMEVAL(&sp->ts); sp->stream = srcv->sinfo_stream; sp->msg_is_complete = 1; sp->sender_all_done = 1; sp->some_taken = 0; sp->data = m; sp->tail_mbuf = NULL; sctp_set_prsctp_policy(sp); /* * We could in theory (for sendall) sifa the length in, but we would * still have to hunt through the chain since we need to setup the * tail_mbuf */ sp->length = 0; for (at = m; at; at = SCTP_BUF_NEXT(at)) { if (SCTP_BUF_NEXT(at) == NULL) sp->tail_mbuf = at; sp->length += SCTP_BUF_LEN(at); } if (srcv->sinfo_keynumber_valid) { sp->auth_keyid = srcv->sinfo_keynumber; } else { sp->auth_keyid = stcb->asoc.authinfo.active_keyid; } if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { sctp_auth_key_acquire(stcb, sp->auth_keyid); sp->holds_key_ref = 1; } if (hold_stcb_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } sctp_snd_sb_alloc(stcb, sp->length); atomic_add_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_INSERT_TAIL(&strm->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb, &stcb->asoc, strm, sp, 1); m = NULL; if (hold_stcb_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } out_now: if (m) { sctp_m_freem(m); } return (error); } static struct mbuf * sctp_copy_mbufchain(struct mbuf *clonechain, struct mbuf *outchain, struct mbuf **endofchain, int can_take_mbuf, int sizeofcpy, uint8_t copy_by_ref) { struct mbuf *m; struct mbuf *appendchain; caddr_t cp; int len; if (endofchain == NULL) { /* error */ error_out: if (outchain) sctp_m_freem(outchain); return (NULL); } if (can_take_mbuf) { appendchain = clonechain; } else { if (!copy_by_ref && (sizeofcpy <= (int)((((SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) - 1) * MLEN) + MHLEN))) ) { /* Its not in a cluster */ if (*endofchain == NULL) { /* lets get a mbuf cluster */ if (outchain == NULL) { /* This is the general case */ new_mbuf: outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER); if (outchain == NULL) { goto error_out; } SCTP_BUF_LEN(outchain) = 0; *endofchain = outchain; /* get the prepend space */ SCTP_BUF_RESV_UF(outchain, (SCTP_FIRST_MBUF_RESV + 4)); } else { /* * We really should not get a NULL * in endofchain */ /* find end */ m = outchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { *endofchain = m; break; } m = SCTP_BUF_NEXT(m); } /* sanity */ if (*endofchain == NULL) { /* * huh, TSNH XXX maybe we * should panic */ sctp_m_freem(outchain); goto new_mbuf; } } /* get the new end of length */ len = (int)M_TRAILINGSPACE(*endofchain); } else { /* how much is left at the end? */ len = (int)M_TRAILINGSPACE(*endofchain); } /* Find the end of the data, for appending */ cp = (mtod((*endofchain), caddr_t)+SCTP_BUF_LEN((*endofchain))); /* Now lets copy it out */ if (len >= sizeofcpy) { /* It all fits, copy it in */ m_copydata(clonechain, 0, sizeofcpy, cp); SCTP_BUF_LEN((*endofchain)) += sizeofcpy; } else { /* fill up the end of the chain */ if (len > 0) { m_copydata(clonechain, 0, len, cp); SCTP_BUF_LEN((*endofchain)) += len; /* now we need another one */ sizeofcpy -= len; } m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER); if (m == NULL) { /* We failed */ goto error_out; } SCTP_BUF_NEXT((*endofchain)) = m; *endofchain = m; cp = mtod((*endofchain), caddr_t); m_copydata(clonechain, len, sizeofcpy, cp); SCTP_BUF_LEN((*endofchain)) += sizeofcpy; } return (outchain); } else { /* copy the old fashion way */ appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_NOWAIT); #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(appendchain, SCTP_MBUF_ICOPY); } #endif } } if (appendchain == NULL) { /* error */ if (outchain) sctp_m_freem(outchain); return (NULL); } if (outchain) { /* tack on to the end */ if (*endofchain != NULL) { SCTP_BUF_NEXT(((*endofchain))) = appendchain; } else { m = outchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { SCTP_BUF_NEXT(m) = appendchain; break; } m = SCTP_BUF_NEXT(m); } } /* * save off the end and update the end-chain position */ m = appendchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { *endofchain = m; break; } m = SCTP_BUF_NEXT(m); } return (outchain); } else { /* save off the end and update the end-chain position */ m = appendchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { *endofchain = m; break; } m = SCTP_BUF_NEXT(m); } return (appendchain); } } static int sctp_med_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc, int *num_out, int *reason_code, int control_only, int from_where, struct timeval *now, int *now_filled, int frag_point, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ); static void sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_copy_all *ca; struct mbuf *m; int ret = 0; int added_control = 0; int un_sent, do_chunk_output = 1; struct sctp_association *asoc; struct sctp_nets *net; ca = (struct sctp_copy_all *)ptr; if (ca->m == NULL) { return; } if (ca->inp != inp) { /* TSNH */ return; } if (ca->sndlen > 0) { m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_NOWAIT); if (m == NULL) { /* can't copy so we are done */ ca->cnt_failed++; return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m, SCTP_MBUF_ICOPY); } #endif } else { m = NULL; } SCTP_TCB_LOCK_ASSERT(stcb); if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } if (ca->sndrcv.sinfo_flags & SCTP_ABORT) { /* Abort this assoc with m as the user defined reason */ if (m != NULL) { SCTP_BUF_PREPEND(m, sizeof(struct sctp_paramhdr), M_NOWAIT); } else { m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); SCTP_BUF_LEN(m) = sizeof(struct sctp_paramhdr); } if (m != NULL) { struct sctp_paramhdr *ph; ph = mtod(m, struct sctp_paramhdr *); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); ph->param_length = htons((uint16_t) (sizeof(struct sctp_paramhdr) + ca->sndlen)); } /* * We add one here to keep the assoc from dis-appearing on * us. */ atomic_add_int(&stcb->asoc.refcnt, 1); sctp_abort_an_association(inp, stcb, m, SCTP_SO_NOT_LOCKED); /* * sctp_abort_an_association calls sctp_free_asoc() free * association will NOT free it since we incremented the * refcnt .. we do this to prevent it being freed and things * getting tricky since we could end up (from free_asoc) * calling inpcb_free which would get a recursive lock call * to the iterator lock.. But as a consequence of that the * stcb will return to us un-locked.. since free_asoc * returns with either no TCB or the TCB unlocked, we must * relock.. to unlock in the iterator timer :-0 */ SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); goto no_chunk_output; } else { if (m) { ret = sctp_msg_append(stcb, net, m, &ca->sndrcv, 1); } asoc = &stcb->asoc; if (ca->sndrcv.sinfo_flags & SCTP_EOF) { /* shutdown this assoc */ if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED) == 0) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* * there is nothing queued to send, so I'm * done... */ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* * only send SHUTDOWN the first time * through */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown(stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); added_control = 1; do_chunk_output = 0; } } else { /* * we still got (or just got) data to send, * so set SHUTDOWN_PENDING */ /* * XXX sockets draft says that SCTP_EOF * should be sent with no data. currently, * we will allow user data to be sent first * and move to SHUTDOWN-PENDING */ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; abort_anyway: snprintf(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); atomic_add_int(&stcb->asoc.refcnt, 1); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); atomic_add_int(&stcb->asoc.refcnt, -1); goto no_chunk_output; } sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } } } } un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && (stcb->asoc.total_flight > 0) && (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { do_chunk_output = 0; } if (do_chunk_output) sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED); else if (added_control) { int num_out, reason, now_filled = 0; struct timeval now; int frag_point; frag_point = sctp_get_frag_point(stcb, &stcb->asoc); (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out, &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_NOT_LOCKED); } no_chunk_output: if (ret) { ca->cnt_failed++; } else { ca->cnt_sent++; } } static void sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_copy_all *ca; ca = (struct sctp_copy_all *)ptr; /* * Do a notify here? Kacheong suggests that the notify be done at * the send time.. so you would push up a notification if any send * failed. Don't know if this is feasible since the only failures we * have is "memory" related and if you cannot get an mbuf to send * the data you surely can't get an mbuf to send up to notify the * user you can't send the data :-> */ /* now free everything */ sctp_m_freem(ca->m); SCTP_FREE(ca, SCTP_M_COPYAL); } static struct mbuf * sctp_copy_out_all(struct uio *uio, int len) { struct mbuf *ret, *at; int left, willcpy, cancpy, error; ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAITOK, 1, MT_DATA); if (ret == NULL) { /* TSNH */ return (NULL); } left = len; SCTP_BUF_LEN(ret) = 0; /* save space for the data chunk header */ cancpy = (int)M_TRAILINGSPACE(ret); willcpy = min(cancpy, left); at = ret; while (left > 0) { /* Align data to the end */ error = uiomove(mtod(at, caddr_t), willcpy, uio); if (error) { err_out_now: sctp_m_freem(at); return (NULL); } SCTP_BUF_LEN(at) = willcpy; SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0; left -= willcpy; if (left > 0) { SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAITOK, 1, MT_DATA); if (SCTP_BUF_NEXT(at) == NULL) { goto err_out_now; } at = SCTP_BUF_NEXT(at); SCTP_BUF_LEN(at) = 0; cancpy = (int)M_TRAILINGSPACE(at); willcpy = min(cancpy, left); } } return (ret); } static int sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, struct sctp_sndrcvinfo *srcv) { int ret; struct sctp_copy_all *ca; SCTP_MALLOC(ca, struct sctp_copy_all *, sizeof(struct sctp_copy_all), SCTP_M_COPYAL); if (ca == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } memset(ca, 0, sizeof(struct sctp_copy_all)); ca->inp = inp; if (srcv) { memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); } /* * take off the sendall flag, it would be bad if we failed to do * this :-0 */ ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL; /* get length and mbuf chain */ if (uio) { ca->sndlen = (int)uio->uio_resid; ca->m = sctp_copy_out_all(uio, ca->sndlen); if (ca->m == NULL) { SCTP_FREE(ca, SCTP_M_COPYAL); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } } else { /* Gather the length of the send */ struct mbuf *mat; ca->sndlen = 0; for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { ca->sndlen += SCTP_BUF_LEN(mat); } } ret = sctp_initiate_iterator(NULL, sctp_sendall_iterator, NULL, SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE, (void *)ca, 0, sctp_sendall_completes, inp, 1); if (ret) { SCTP_PRINTF("Failed to initiate iterator for sendall\n"); SCTP_FREE(ca, SCTP_M_COPYAL); SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT); return (EFAULT); } return (0); } void sctp_toss_old_cookies(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *chk, *nchk; TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } asoc->ctrl_queue_cnt--; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } } } void sctp_toss_old_asconf(struct sctp_tcb *stcb) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *nchk; struct sctp_asconf_chunk *acp; asoc = &stcb->asoc; TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { /* find SCTP_ASCONF chunk in queue */ if (chk->rec.chunk_id.id == SCTP_ASCONF) { if (chk->data) { acp = mtod(chk->data, struct sctp_asconf_chunk *); if (SCTP_TSN_GT(ntohl(acp->serial_number), asoc->asconf_seq_out_acked)) { /* Not Acked yet */ break; } } TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } asoc->ctrl_queue_cnt--; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } } } static void sctp_clean_up_datalist(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_tmit_chunk **data_list, int bundle_at, struct sctp_nets *net) { int i; struct sctp_tmit_chunk *tp1; for (i = 0; i < bundle_at; i++) { /* off of the send queue */ TAILQ_REMOVE(&asoc->send_queue, data_list[i], sctp_next); asoc->send_queue_cnt--; if (i > 0) { /* * Any chunk NOT 0 you zap the time chunk 0 gets * zapped or set based on if a RTO measurment is * needed. */ data_list[i]->do_rtt = 0; } /* record time */ data_list[i]->sent_rcv_time = net->last_sent_time; data_list[i]->rec.data.cwnd_at_send = net->cwnd; data_list[i]->rec.data.fast_retran_tsn = data_list[i]->rec.data.TSN_seq; if (data_list[i]->whoTo == NULL) { data_list[i]->whoTo = net; atomic_add_int(&net->ref_count, 1); } /* on to the sent queue */ tp1 = TAILQ_LAST(&asoc->sent_queue, sctpchunk_listhead); if ((tp1) && SCTP_TSN_GT(tp1->rec.data.TSN_seq, data_list[i]->rec.data.TSN_seq)) { struct sctp_tmit_chunk *tpp; /* need to move back */ back_up_more: tpp = TAILQ_PREV(tp1, sctpchunk_listhead, sctp_next); if (tpp == NULL) { TAILQ_INSERT_BEFORE(tp1, data_list[i], sctp_next); goto all_done; } tp1 = tpp; if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, data_list[i]->rec.data.TSN_seq)) { goto back_up_more; } TAILQ_INSERT_AFTER(&asoc->sent_queue, tp1, data_list[i], sctp_next); } else { TAILQ_INSERT_TAIL(&asoc->sent_queue, data_list[i], sctp_next); } all_done: /* This does not lower until the cum-ack passes it */ asoc->sent_queue_cnt++; if ((asoc->peers_rwnd <= 0) && (asoc->total_flight == 0) && (bundle_at == 1)) { /* Mark the chunk as being a window probe */ SCTP_STAT_INCR(sctps_windowprobed); } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC2, 3); #endif data_list[i]->sent = SCTP_DATAGRAM_SENT; data_list[i]->snd_count = 1; data_list[i]->rec.data.chunk_was_revoked = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_UP, data_list[i]->whoTo->flight_size, data_list[i]->book_size, (uint32_t) (uintptr_t) data_list[i]->whoTo, data_list[i]->rec.data.TSN_seq); } sctp_flight_size_increase(data_list[i]); sctp_total_flight_increase(stcb, data_list[i]); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd(SCTP_DECREASE_PEER_RWND, asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); } asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd, (uint32_t) (data_list[i]->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))); if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } } if (asoc->cc_functions.sctp_cwnd_update_packet_transmitted) { (*asoc->cc_functions.sctp_cwnd_update_packet_transmitted) (stcb, net); } } static void sctp_clean_up_ctl(struct sctp_tcb *stcb, struct sctp_association *asoc, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { struct sctp_tmit_chunk *chk, *nchk; TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) || (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) || (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) || (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) || (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) || (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) || (chk->rec.chunk_id.id == SCTP_ECN_CWR) || (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) { /* Stray chunks must be cleaned up */ clean_up_anyway: TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } asoc->ctrl_queue_cnt--; if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) asoc->fwd_tsn_cnt--; sctp_free_a_chunk(stcb, chk, so_locked); } else if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) { /* special handling, we must look into the param */ if (chk != asoc->str_reset) { goto clean_up_anyway; } } } } static int sctp_can_we_split_this(struct sctp_tcb *stcb, uint32_t length, uint32_t goal_mtu, uint32_t frag_point, int eeor_on) { /* * Make a decision on if I should split a msg into multiple parts. * This is only asked of incomplete messages. */ if (eeor_on) { /* * If we are doing EEOR we need to always send it if its the * entire thing, since it might be all the guy is putting in * the hopper. */ if (goal_mtu >= length) { /*- * If we have data outstanding, * we get another chance when the sack * arrives to transmit - wait for more data */ if (stcb->asoc.total_flight == 0) { /* * If nothing is in flight, we zero the * packet counter. */ return (length); } return (0); } else { /* You can fill the rest */ return (goal_mtu); } } /*- * For those strange folk that make the send buffer * smaller than our fragmentation point, we can't * get a full msg in so we have to allow splitting. */ if (SCTP_SB_LIMIT_SND(stcb->sctp_socket) < frag_point) { return (length); } if ((length <= goal_mtu) || ((length - goal_mtu) < SCTP_BASE_SYSCTL(sctp_min_residual))) { /* Sub-optimial residual don't split in non-eeor mode. */ return (0); } /* * If we reach here length is larger than the goal_mtu. Do we wish * to split it for the sake of packet putting together? */ if (goal_mtu >= min(SCTP_BASE_SYSCTL(sctp_min_split_point), frag_point)) { /* Its ok to split it */ return (min(goal_mtu, frag_point)); } /* Nope, can't split */ return (0); } static uint32_t sctp_move_to_outqueue(struct sctp_tcb *stcb, struct sctp_stream_out *strq, uint32_t goal_mtu, uint32_t frag_point, int *giveup, int eeor_mode, int *bail, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { /* Move from the stream to the send_queue keeping track of the total */ struct sctp_association *asoc; struct sctp_stream_queue_pending *sp; struct sctp_tmit_chunk *chk; struct sctp_data_chunk *dchkh = NULL; struct sctp_idata_chunk *ndchkh = NULL; uint32_t to_move, length; int leading; uint8_t rcv_flags = 0; uint8_t some_taken; uint8_t send_lock_up = 0; SCTP_TCB_LOCK_ASSERT(stcb); asoc = &stcb->asoc; one_more_time: /* sa_ignore FREED_MEMORY */ sp = TAILQ_FIRST(&strq->outqueue); if (sp == NULL) { if (send_lock_up == 0) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } sp = TAILQ_FIRST(&strq->outqueue); if (sp) { goto one_more_time; } if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_EXPLICIT_EOR) == 0) && (stcb->asoc.idata_supported == 0) && (strq->last_msg_incomplete)) { SCTP_PRINTF("Huh? Stream:%d lm_in_c=%d but queue is NULL\n", strq->stream_no, strq->last_msg_incomplete); strq->last_msg_incomplete = 0; } to_move = 0; if (send_lock_up) { SCTP_TCB_SEND_UNLOCK(stcb); send_lock_up = 0; } goto out_of; } if ((sp->msg_is_complete) && (sp->length == 0)) { if (sp->sender_all_done) { /* * We are doing differed cleanup. Last time through * when we took all the data the sender_all_done was * not set. */ if ((sp->put_last_out == 0) && (sp->discard_rest == 0)) { SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n", sp->sender_all_done, sp->length, sp->msg_is_complete, sp->put_last_out, send_lock_up); } if ((TAILQ_NEXT(sp, next) == NULL) && (send_lock_up == 0)) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&strq->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up); if ((strq->state == SCTP_STREAM_RESET_PENDING) && (strq->chunks_on_queues == 0) && TAILQ_EMPTY(&strq->outqueue)) { stcb->asoc.trigger_reset = 1; } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); /* we can't be locked to it */ if (send_lock_up) { SCTP_TCB_SEND_UNLOCK(stcb); send_lock_up = 0; } /* back to get the next msg */ goto one_more_time; } else { /* * sender just finished this but still holds a * reference */ *giveup = 1; to_move = 0; goto out_of; } } else { /* is there some to get */ if (sp->length == 0) { /* no */ *giveup = 1; to_move = 0; goto out_of; } else if (sp->discard_rest) { if (send_lock_up == 0) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } /* Whack down the size */ atomic_subtract_int(&stcb->asoc.total_output_queue_size, sp->length); if ((stcb->sctp_socket != NULL) && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length); } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; sp->tail_mbuf = NULL; } sp->length = 0; sp->some_taken = 1; *giveup = 1; to_move = 0; goto out_of; } } some_taken = sp->some_taken; re_look: length = sp->length; if (sp->msg_is_complete) { /* The message is complete */ to_move = min(length, frag_point); if (to_move == length) { /* All of it fits in the MTU */ if (sp->some_taken) { rcv_flags |= SCTP_DATA_LAST_FRAG; } else { rcv_flags |= SCTP_DATA_NOT_FRAG; } sp->put_last_out = 1; if (sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) { rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY; } } else { /* Not all of it fits, we fragment */ if (sp->some_taken == 0) { rcv_flags |= SCTP_DATA_FIRST_FRAG; } sp->some_taken = 1; } } else { to_move = sctp_can_we_split_this(stcb, length, goal_mtu, frag_point, eeor_mode); if (to_move) { /*- * We use a snapshot of length in case it * is expanding during the compare. */ uint32_t llen; llen = length; if (to_move >= llen) { to_move = llen; if (send_lock_up == 0) { /*- * We are taking all of an incomplete msg * thus we need a send lock. */ SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; if (sp->msg_is_complete) { /* * the sender finished the * msg */ goto re_look; } } } if (sp->some_taken == 0) { rcv_flags |= SCTP_DATA_FIRST_FRAG; sp->some_taken = 1; } } else { /* Nothing to take. */ *giveup = 1; to_move = 0; goto out_of; } } /* If we reach here, we can copy out a chunk */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* No chunk memory */ *giveup = 1; to_move = 0; goto out_of; } /* * Setup for unordered if needed by looking at the user sent info * flags. */ if (sp->sinfo_flags & SCTP_UNORDERED) { rcv_flags |= SCTP_DATA_UNORDERED; } if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && (sp->sinfo_flags & SCTP_EOF) == SCTP_EOF) { rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY; } /* clear out the chunk before setting up */ memset(chk, 0, sizeof(*chk)); chk->rec.data.rcv_flags = rcv_flags; if (to_move >= length) { /* we think we can steal the whole thing */ if ((sp->sender_all_done == 0) && (send_lock_up == 0)) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } if (to_move < sp->length) { /* bail, it changed */ goto dont_do_it; } chk->data = sp->data; chk->last_mbuf = sp->tail_mbuf; /* register the stealing */ sp->data = sp->tail_mbuf = NULL; } else { struct mbuf *m; dont_do_it: chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_NOWAIT); chk->last_mbuf = NULL; if (chk->data == NULL) { sp->some_taken = some_taken; sctp_free_a_chunk(stcb, chk, so_locked); *bail = 1; to_move = 0; goto out_of; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(chk->data, SCTP_MBUF_ICOPY); } #endif /* Pull off the data */ m_adj(sp->data, to_move); /* Now lets work our way down and compact it */ m = sp->data; while (m && (SCTP_BUF_LEN(m) == 0)) { sp->data = SCTP_BUF_NEXT(m); SCTP_BUF_NEXT(m) = NULL; if (sp->tail_mbuf == m) { /*- * Freeing tail? TSNH since * we supposedly were taking less * than the sp->length. */ #ifdef INVARIANTS panic("Huh, freing tail? - TSNH"); #else SCTP_PRINTF("Huh, freeing tail? - TSNH\n"); sp->tail_mbuf = sp->data = NULL; sp->length = 0; #endif } sctp_m_free(m); m = sp->data; } } if (SCTP_BUF_IS_EXTENDED(chk->data)) { chk->copy_by_ref = 1; } else { chk->copy_by_ref = 0; } /* * get last_mbuf and counts of mb usage This is ugly but hopefully * its only one mbuf. */ if (chk->last_mbuf == NULL) { chk->last_mbuf = chk->data; while (SCTP_BUF_NEXT(chk->last_mbuf) != NULL) { chk->last_mbuf = SCTP_BUF_NEXT(chk->last_mbuf); } } if (to_move > length) { /*- This should not happen either * since we always lower to_move to the size * of sp->length if its larger. */ #ifdef INVARIANTS panic("Huh, how can to_move be larger?"); #else SCTP_PRINTF("Huh, how can to_move be larger?\n"); sp->length = 0; #endif } else { atomic_subtract_int(&sp->length, to_move); } if (stcb->asoc.idata_supported == 0) { leading = sizeof(struct sctp_data_chunk); } else { leading = sizeof(struct sctp_idata_chunk); } if (M_LEADINGSPACE(chk->data) < leading) { /* Not enough room for a chunk header, get some */ struct mbuf *m; m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 0, MT_DATA); if (m == NULL) { /* * we're in trouble here. _PREPEND below will free * all the data if there is no leading space, so we * must put the data back and restore. */ if (send_lock_up == 0) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } if (sp->data == NULL) { /* unsteal the data */ sp->data = chk->data; sp->tail_mbuf = chk->last_mbuf; } else { struct mbuf *m_tmp; /* reassemble the data */ m_tmp = sp->data; sp->data = chk->data; SCTP_BUF_NEXT(chk->last_mbuf) = m_tmp; } sp->some_taken = some_taken; atomic_add_int(&sp->length, to_move); chk->data = NULL; *bail = 1; sctp_free_a_chunk(stcb, chk, so_locked); to_move = 0; goto out_of; } else { SCTP_BUF_LEN(m) = 0; SCTP_BUF_NEXT(m) = chk->data; chk->data = m; M_ALIGN(chk->data, 4); } } if (stcb->asoc.idata_supported == 0) { SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_data_chunk), M_NOWAIT); } else { SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_idata_chunk), M_NOWAIT); } if (chk->data == NULL) { /* HELP, TSNH since we assured it would not above? */ #ifdef INVARIANTS panic("prepend failes HELP?"); #else SCTP_PRINTF("prepend fails HELP?\n"); sctp_free_a_chunk(stcb, chk, so_locked); #endif *bail = 1; to_move = 0; goto out_of; } if (stcb->asoc.idata_supported == 0) { sctp_snd_sb_alloc(stcb, sizeof(struct sctp_data_chunk)); chk->book_size = chk->send_size = (uint16_t) (to_move + sizeof(struct sctp_data_chunk)); } else { sctp_snd_sb_alloc(stcb, sizeof(struct sctp_idata_chunk)); chk->book_size = chk->send_size = (uint16_t) (to_move + sizeof(struct sctp_idata_chunk)); } chk->book_size_scale = 0; chk->sent = SCTP_DATAGRAM_UNSENT; chk->flags = 0; chk->asoc = &stcb->asoc; chk->pad_inplace = 0; chk->no_fr_allowed = 0; if (stcb->asoc.idata_supported == 0) { if (rcv_flags & SCTP_DATA_UNORDERED) { /* Just use 0. The receiver ignores the values. */ chk->rec.data.stream_seq = 0; } else { chk->rec.data.stream_seq = strq->next_mid_ordered; if (rcv_flags & SCTP_DATA_LAST_FRAG) { strq->next_mid_ordered++; } } } else { if (rcv_flags & SCTP_DATA_UNORDERED) { chk->rec.data.stream_seq = strq->next_mid_unordered; if (rcv_flags & SCTP_DATA_LAST_FRAG) { strq->next_mid_unordered++; } } else { chk->rec.data.stream_seq = strq->next_mid_ordered; if (rcv_flags & SCTP_DATA_LAST_FRAG) { strq->next_mid_ordered++; } } } chk->rec.data.stream_number = sp->stream; chk->rec.data.payloadtype = sp->ppid; chk->rec.data.context = sp->context; chk->rec.data.doing_fast_retransmit = 0; chk->rec.data.timetodrop = sp->ts; chk->flags = sp->act_flags; if (sp->net) { chk->whoTo = sp->net; atomic_add_int(&chk->whoTo->ref_count, 1); } else chk->whoTo = NULL; if (sp->holds_key_ref) { chk->auth_keyid = sp->auth_keyid; sctp_auth_key_acquire(stcb, chk->auth_keyid); chk->holds_key_ref = 1; } chk->rec.data.TSN_seq = atomic_fetchadd_int(&asoc->sending_seq, 1); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_OUTQ) { sctp_misc_ints(SCTP_STRMOUT_LOG_SEND, (uint32_t) (uintptr_t) stcb, sp->length, (uint32_t) ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq), chk->rec.data.TSN_seq); } if (stcb->asoc.idata_supported == 0) { dchkh = mtod(chk->data, struct sctp_data_chunk *); } else { ndchkh = mtod(chk->data, struct sctp_idata_chunk *); } /* * Put the rest of the things in place now. Size was done earlier in * previous loop prior to padding. */ #ifdef SCTP_ASOCLOG_OF_TSNS SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->tsn_out_at >= SCTP_TSN_LOG_SIZE) { asoc->tsn_out_at = 0; asoc->tsn_out_wrapped = 1; } asoc->out_tsnlog[asoc->tsn_out_at].tsn = chk->rec.data.TSN_seq; asoc->out_tsnlog[asoc->tsn_out_at].strm = chk->rec.data.stream_number; asoc->out_tsnlog[asoc->tsn_out_at].seq = chk->rec.data.stream_seq; asoc->out_tsnlog[asoc->tsn_out_at].sz = chk->send_size; asoc->out_tsnlog[asoc->tsn_out_at].flgs = chk->rec.data.rcv_flags; asoc->out_tsnlog[asoc->tsn_out_at].stcb = (void *)stcb; asoc->out_tsnlog[asoc->tsn_out_at].in_pos = asoc->tsn_out_at; asoc->out_tsnlog[asoc->tsn_out_at].in_out = 2; asoc->tsn_out_at++; #endif if (stcb->asoc.idata_supported == 0) { dchkh->ch.chunk_type = SCTP_DATA; dchkh->ch.chunk_flags = chk->rec.data.rcv_flags; dchkh->dp.tsn = htonl(chk->rec.data.TSN_seq); dchkh->dp.stream_id = htons((strq->stream_no & 0x0000ffff)); dchkh->dp.stream_sequence = htons((uint16_t) chk->rec.data.stream_seq); dchkh->dp.protocol_id = chk->rec.data.payloadtype; dchkh->ch.chunk_length = htons(chk->send_size); } else { ndchkh->ch.chunk_type = SCTP_IDATA; ndchkh->ch.chunk_flags = chk->rec.data.rcv_flags; ndchkh->dp.tsn = htonl(chk->rec.data.TSN_seq); ndchkh->dp.stream_id = htons(strq->stream_no); ndchkh->dp.reserved = htons(0); ndchkh->dp.msg_id = htonl(chk->rec.data.stream_seq); if (sp->fsn == 0) ndchkh->dp.ppid_fsn.protocol_id = chk->rec.data.payloadtype; else ndchkh->dp.ppid_fsn.fsn = htonl(sp->fsn); sp->fsn++; ndchkh->ch.chunk_length = htons(chk->send_size); } /* Now advance the chk->send_size by the actual pad needed. */ if (chk->send_size < SCTP_SIZE32(chk->book_size)) { /* need a pad */ struct mbuf *lm; int pads; pads = SCTP_SIZE32(chk->book_size) - chk->send_size; lm = sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf); if (lm != NULL) { chk->last_mbuf = lm; chk->pad_inplace = 1; } chk->send_size += pads; } if (PR_SCTP_ENABLED(chk->flags)) { asoc->pr_sctp_cnt++; } if (sp->msg_is_complete && (sp->length == 0) && (sp->sender_all_done)) { /* All done pull and kill the message */ if (sp->put_last_out == 0) { SCTP_PRINTF("Gak, put out entire msg with NO end!-2\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n", sp->sender_all_done, sp->length, sp->msg_is_complete, sp->put_last_out, send_lock_up); } if ((send_lock_up == 0) && (TAILQ_NEXT(sp, next) == NULL)) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&strq->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up); if ((strq->state == SCTP_STREAM_RESET_PENDING) && (strq->chunks_on_queues == 0) && TAILQ_EMPTY(&strq->outqueue)) { stcb->asoc.trigger_reset = 1; } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); } asoc->chunks_on_out_queue++; strq->chunks_on_queues++; TAILQ_INSERT_TAIL(&asoc->send_queue, chk, sctp_next); asoc->send_queue_cnt++; out_of: if (send_lock_up) { SCTP_TCB_SEND_UNLOCK(stcb); } return (to_move); } static void sctp_fill_outqueue(struct sctp_tcb *stcb, struct sctp_nets *net, int frag_point, int eeor_mode, int *quit_now, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { struct sctp_association *asoc; struct sctp_stream_out *strq; int goal_mtu, moved_how_much, total_moved = 0, bail = 0; int giveup; SCTP_TCB_LOCK_ASSERT(stcb); asoc = &stcb->asoc; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: goal_mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: goal_mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ goal_mtu = net->mtu; break; } /* Need an allowance for the data chunk header too */ if (stcb->asoc.idata_supported == 0) { goal_mtu -= sizeof(struct sctp_data_chunk); } else { goal_mtu -= sizeof(struct sctp_idata_chunk); } /* must make even word boundary */ goal_mtu &= 0xfffffffc; strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); while ((goal_mtu > 0) && strq) { giveup = 0; bail = 0; moved_how_much = sctp_move_to_outqueue(stcb, strq, goal_mtu, frag_point, &giveup, eeor_mode, &bail, so_locked); stcb->asoc.ss_functions.sctp_ss_scheduled(stcb, net, asoc, strq, moved_how_much); if ((giveup) || bail) { break; } strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); if (strq == NULL) { break; } total_moved += moved_how_much; goal_mtu -= (moved_how_much + sizeof(struct sctp_data_chunk)); goal_mtu &= 0xfffffffc; } if (bail) *quit_now = 1; stcb->asoc.ss_functions.sctp_ss_packet_done(stcb, net, asoc); if (total_moved == 0) { if ((stcb->asoc.sctp_cmt_on_off == 0) && (net == stcb->asoc.primary_destination)) { /* ran dry for primary network net */ SCTP_STAT_INCR(sctps_primary_randry); } else if (stcb->asoc.sctp_cmt_on_off > 0) { /* ran dry with CMT on */ SCTP_STAT_INCR(sctps_cmt_randry); } } } void sctp_fix_ecn_echo(struct sctp_association *asoc) { struct sctp_tmit_chunk *chk; TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) { chk->sent = SCTP_DATAGRAM_UNSENT; } } } void sctp_move_chunks_from_net(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_stream_queue_pending *sp; unsigned int i; if (net == NULL) { return; } asoc = &stcb->asoc; for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_FOREACH(sp, &stcb->asoc.strmout[i].outqueue, next) { if (sp->net == net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } } } TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) { if (chk->whoTo == net) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } } } int sctp_med_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc, int *num_out, int *reason_code, int control_only, int from_where, struct timeval *now, int *now_filled, int frag_point, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { /** * Ok this is the generic chunk service queue. we must do the * following: * - Service the stream queue that is next, moving any * message (note I must get a complete message i.e. FIRST/MIDDLE and * LAST to the out queue in one pass) and assigning TSN's. This * only applys though if the peer does not support NDATA. For NDATA * chunks its ok to not send the entire message ;-) * - Check to see if the cwnd/rwnd allows any output, if so we go ahead and * fomulate and send the low level chunks. Making sure to combine * any control in the control chunk queue also. */ struct sctp_nets *net, *start_at, *sack_goes_to = NULL, *old_start_at = NULL; struct mbuf *outchain, *endoutchain; struct sctp_tmit_chunk *chk, *nchk; /* temp arrays for unlinking */ struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING]; int no_fragmentflg, error; unsigned int max_rwnd_per_dest, max_send_per_dest; int one_chunk, hbflag, skip_data_for_this_net; int asconf, cookie, no_out_cnt; int bundle_at, ctl_cnt, no_data_chunks, eeor_mode; unsigned int mtu, r_mtu, omtu, mx_mtu, to_out; int tsns_sent = 0; uint32_t auth_offset = 0; struct sctp_auth_chunk *auth = NULL; uint16_t auth_keyid; int override_ok = 1; int skip_fill_up = 0; int data_auth_reqd = 0; /* * JRS 5/14/07 - Add flag for whether a heartbeat is sent to the * destination. */ int quit_now = 0; *num_out = 0; *reason_code = 0; auth_keyid = stcb->asoc.authinfo.active_keyid; if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED) || (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) { eeor_mode = 1; } else { eeor_mode = 0; } ctl_cnt = no_out_cnt = asconf = cookie = 0; /* * First lets prime the pump. For each destination, if there is room * in the flight size, attempt to pull an MTU's worth out of the * stream queues into the general send_queue */ #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC2, 2); #endif SCTP_TCB_LOCK_ASSERT(stcb); hbflag = 0; if (control_only) no_data_chunks = 1; else no_data_chunks = 0; /* Nothing to possible to send? */ if ((TAILQ_EMPTY(&asoc->control_send_queue) || (asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) && TAILQ_EMPTY(&asoc->asconf_send_queue) && TAILQ_EMPTY(&asoc->send_queue) && sctp_is_there_unsent_data(stcb, so_locked) == 0) { nothing_to_send: *reason_code = 9; return (0); } if (asoc->peers_rwnd == 0) { /* No room in peers rwnd */ *reason_code = 1; if (asoc->total_flight > 0) { /* we are allowed one chunk in flight */ no_data_chunks = 1; } } if (stcb->asoc.ecn_echo_cnt_onq) { /* Record where a sack goes, if any */ if (no_data_chunks && (asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) { /* Nothing but ECNe to send - we don't do that */ goto nothing_to_send; } TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) { sack_goes_to = chk->whoTo; break; } } } max_rwnd_per_dest = ((asoc->peers_rwnd + asoc->total_flight) / asoc->numnets); if (stcb->sctp_socket) max_send_per_dest = SCTP_SB_LIMIT_SND(stcb->sctp_socket) / asoc->numnets; else max_send_per_dest = 0; if (no_data_chunks == 0) { /* How many non-directed chunks are there? */ TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) { if (chk->whoTo == NULL) { /* * We already have non-directed chunks on * the queue, no need to do a fill-up. */ skip_fill_up = 1; break; } } } if ((no_data_chunks == 0) && (skip_fill_up == 0) && (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc))) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { /* * This for loop we are in takes in each net, if * its's got space in cwnd and has data sent to it * (when CMT is off) then it calls * sctp_fill_outqueue for the net. This gets data on * the send queue for that network. * * In sctp_fill_outqueue TSN's are assigned and data is * copied out of the stream buffers. Note mostly * copy by reference (we hope). */ net->window_probe = 0; if ((net != stcb->asoc.alternate) && ((net->dest_state & SCTP_ADDR_PF) || (!(net->dest_state & SCTP_ADDR_REACHABLE)) || (net->dest_state & SCTP_ADDR_UNCONFIRMED))) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 1, SCTP_CWND_LOG_FILL_OUTQ_CALLED); } continue; } if ((stcb->asoc.cc_functions.sctp_cwnd_new_transmission_begins) && (net->flight_size == 0)) { (*stcb->asoc.cc_functions.sctp_cwnd_new_transmission_begins) (stcb, net); } if (net->flight_size >= net->cwnd) { /* skip this network, no room - can't fill */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 3, SCTP_CWND_LOG_FILL_OUTQ_CALLED); } continue; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 4, SCTP_CWND_LOG_FILL_OUTQ_CALLED); } sctp_fill_outqueue(stcb, net, frag_point, eeor_mode, &quit_now, so_locked); if (quit_now) { /* memory alloc failure */ no_data_chunks = 1; break; } } } /* now service each destination and send out what we can for it */ /* Nothing to send? */ if (TAILQ_EMPTY(&asoc->control_send_queue) && TAILQ_EMPTY(&asoc->asconf_send_queue) && TAILQ_EMPTY(&asoc->send_queue)) { *reason_code = 8; return (0); } if (asoc->sctp_cmt_on_off > 0) { /* get the last start point */ start_at = asoc->last_net_cmt_send_started; if (start_at == NULL) { /* null so to beginning */ start_at = TAILQ_FIRST(&asoc->nets); } else { start_at = TAILQ_NEXT(asoc->last_net_cmt_send_started, sctp_next); if (start_at == NULL) { start_at = TAILQ_FIRST(&asoc->nets); } } asoc->last_net_cmt_send_started = start_at; } else { start_at = TAILQ_FIRST(&asoc->nets); } TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->whoTo == NULL) { if (asoc->alternate) { chk->whoTo = asoc->alternate; } else { chk->whoTo = asoc->primary_destination; } atomic_add_int(&chk->whoTo->ref_count, 1); } } old_start_at = NULL; again_one_more_time: for (net = start_at; net != NULL; net = TAILQ_NEXT(net, sctp_next)) { /* how much can we send? */ /* SCTPDBG("Examine for sending net:%x\n", (uint32_t)net); */ if (old_start_at && (old_start_at == net)) { /* through list ocmpletely. */ break; } tsns_sent = 0xa; if (TAILQ_EMPTY(&asoc->control_send_queue) && TAILQ_EMPTY(&asoc->asconf_send_queue) && (net->flight_size >= net->cwnd)) { /* * Nothing on control or asconf and flight is full, * we can skip even in the CMT case. */ continue; } bundle_at = 0; endoutchain = outchain = NULL; no_fragmentflg = 1; one_chunk = 0; if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { skip_data_for_this_net = 1; } else { skip_data_for_this_net = 0; } switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } mx_mtu = mtu; to_out = 0; if (mtu > asoc->peers_rwnd) { if (asoc->total_flight > 0) { /* We have a packet in flight somewhere */ r_mtu = asoc->peers_rwnd; } else { /* We are always allowed to send one MTU out */ one_chunk = 1; r_mtu = mtu; } } else { r_mtu = mtu; } error = 0; /************************/ /* ASCONF transmission */ /************************/ /* Now first lets go through the asconf queue */ TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { if (chk->rec.chunk_id.id != SCTP_ASCONF) { continue; } if (chk->whoTo == NULL) { if (asoc->alternate == NULL) { if (asoc->primary_destination != net) { break; } } else { if (asoc->alternate != net) { break; } } } else { if (chk->whoTo != net) { break; } } if (chk->data == NULL) { break; } if (chk->sent != SCTP_DATAGRAM_UNSENT && chk->sent != SCTP_DATAGRAM_RESEND) { break; } /* * if no AUTH is yet included and this chunk * requires it, make sure to account for it. We * don't apply the size until the AUTH chunk is * actually added below in case there is no room for * this chunk. NOTE: we overload the use of "omtu" * here */ if ((auth == NULL) && sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks)) { omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else omtu = 0; /* Here we do NOT factor the r_mtu */ if ((chk->send_size < (int)(mtu - omtu)) || (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { /* * We probably should glom the mbuf chain * from the chk->data for control but the * problem is it becomes yet one more level * of tracking to do if for some reason * output fails. Then I have got to * reconstruct the merged control chain.. el * yucko.. for now we take the easy way and * do the copy */ /* * Add an AUTH chunk, if chunk requires it * save the offset into the chain for AUTH */ if ((auth == NULL) && (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks))) { outchain = sctp_add_auth_chunk(outchain, &endoutchain, &auth, &auth_offset, stcb, chk->rec.chunk_id.id); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, (int)chk->rec.chunk_id.can_take_data, chk->send_size, chk->copy_by_ref); if (outchain == NULL) { *reason_code = 8; SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); /* update our MTU size */ if (mtu > (chk->send_size + omtu)) mtu -= (chk->send_size + omtu); else mtu = 0; to_out += (chk->send_size + omtu); /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } if (chk->rec.chunk_id.can_take_data) chk->data = NULL; /* * set hb flag since we can use these for * RTO */ hbflag = 1; asconf = 1; /* * should sysctl this: don't bundle data * with ASCONF since it requires AUTH */ no_data_chunks = 1; chk->sent = SCTP_DATAGRAM_SENT; if (chk->whoTo == NULL) { chk->whoTo = net; atomic_add_int(&net->ref_count, 1); } chk->snd_count++; if (mtu == 0) { /* * Ok we are out of room but we can * output without effecting the * flight size since this little guy * is a control only packet. */ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); /* * do NOT clear the asconf flag as * it is used to do appropriate * source address selection. */ if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(now); *now_filled = 1; } net->last_sent_time = *now; hbflag = 0; if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, auth_offset, auth, stcb->asoc.authinfo.active_keyid, no_fragmentflg, 0, asconf, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* * error, we could not * output */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } if (error == ENOBUFS) { asoc->ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } /* error, could not output */ if (error == EHOSTUNREACH) { /* * Destination went * unreachable * during this send */ sctp_move_chunks_from_net(stcb, net); } *reason_code = 7; break; } else { asoc->ifp_had_enobuf = 0; } /* * increase the number we sent, if a * cookie is sent we don't tell them * any was sent out. */ outchain = endoutchain = NULL; auth = NULL; auth_offset = 0; if (!no_out_cnt) *num_out += ctl_cnt; /* recalc a clean slate and setup */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } to_out = 0; no_fragmentflg = 1; } } } if (error != 0) { /* try next net */ continue; } /************************/ /* Control transmission */ /************************/ /* Now first lets go through the control queue */ TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { if ((sack_goes_to) && (chk->rec.chunk_id.id == SCTP_ECN_ECHO) && (chk->whoTo != sack_goes_to)) { /* * if we have a sack in queue, and we are * looking at an ecn echo that is NOT queued * to where the sack is going.. */ if (chk->whoTo == net) { /* * Don't transmit it to where its * going (current net) */ continue; } else if (sack_goes_to == net) { /* * But do transmit it to this * address */ goto skip_net_check; } } if (chk->whoTo == NULL) { if (asoc->alternate == NULL) { if (asoc->primary_destination != net) { continue; } } else { if (asoc->alternate != net) { continue; } } } else { if (chk->whoTo != net) { continue; } } skip_net_check: if (chk->data == NULL) { continue; } if (chk->sent != SCTP_DATAGRAM_UNSENT) { /* * It must be unsent. Cookies and ASCONF's * hang around but there timers will force * when marked for resend. */ continue; } /* * if no AUTH is yet included and this chunk * requires it, make sure to account for it. We * don't apply the size until the AUTH chunk is * actually added below in case there is no room for * this chunk. NOTE: we overload the use of "omtu" * here */ if ((auth == NULL) && sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks)) { omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else omtu = 0; /* Here we do NOT factor the r_mtu */ if ((chk->send_size <= (int)(mtu - omtu)) || (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { /* * We probably should glom the mbuf chain * from the chk->data for control but the * problem is it becomes yet one more level * of tracking to do if for some reason * output fails. Then I have got to * reconstruct the merged control chain.. el * yucko.. for now we take the easy way and * do the copy */ /* * Add an AUTH chunk, if chunk requires it * save the offset into the chain for AUTH */ if ((auth == NULL) && (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks))) { outchain = sctp_add_auth_chunk(outchain, &endoutchain, &auth, &auth_offset, stcb, chk->rec.chunk_id.id); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, (int)chk->rec.chunk_id.can_take_data, chk->send_size, chk->copy_by_ref); if (outchain == NULL) { *reason_code = 8; SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); /* update our MTU size */ if (mtu > (chk->send_size + omtu)) mtu -= (chk->send_size + omtu); else mtu = 0; to_out += (chk->send_size + omtu); /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } if (chk->rec.chunk_id.can_take_data) chk->data = NULL; /* Mark things to be removed, if needed */ if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) || (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) || (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) || (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) || (chk->rec.chunk_id.id == SCTP_ECN_CWR) || (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) || (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) { if (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) { hbflag = 1; } /* remove these chunks at the end */ if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) { /* turn off the timer */ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1); } } ctl_cnt++; } else { /* * Other chunks, since they have * timers running (i.e. COOKIE) we * just "trust" that it gets sent or * retransmitted. */ ctl_cnt++; if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { cookie = 1; no_out_cnt = 1; } else if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) { /* * Increment ecne send count * here this means we may be * over-zealous in our * counting if the send * fails, but its the best * place to do it (we used * to do it in the queue of * the chunk, but that did * not tell how many times * it was sent. */ SCTP_STAT_INCR(sctps_sendecne); } chk->sent = SCTP_DATAGRAM_SENT; if (chk->whoTo == NULL) { chk->whoTo = net; atomic_add_int(&net->ref_count, 1); } chk->snd_count++; } if (mtu == 0) { /* * Ok we are out of room but we can * output without effecting the * flight size since this little guy * is a control only packet. */ if (asconf) { sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); /* * do NOT clear the asconf * flag as it is used to do * appropriate source * address selection. */ } if (cookie) { sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net); cookie = 0; } /* Only HB or ASCONF advances time */ if (hbflag) { if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(now); *now_filled = 1; } net->last_sent_time = *now; hbflag = 0; } if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, auth_offset, auth, stcb->asoc.authinfo.active_keyid, no_fragmentflg, 0, asconf, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* * error, we could not * output */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } if (error == ENOBUFS) { asoc->ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } if (error == EHOSTUNREACH) { /* * Destination went * unreachable * during this send */ sctp_move_chunks_from_net(stcb, net); } *reason_code = 7; break; } else { asoc->ifp_had_enobuf = 0; } /* * increase the number we sent, if a * cookie is sent we don't tell them * any was sent out. */ outchain = endoutchain = NULL; auth = NULL; auth_offset = 0; if (!no_out_cnt) *num_out += ctl_cnt; /* recalc a clean slate and setup */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } to_out = 0; no_fragmentflg = 1; } } } if (error != 0) { /* try next net */ continue; } /* JRI: if dest is in PF state, do not send data to it */ if ((asoc->sctp_cmt_on_off > 0) && (net != stcb->asoc.alternate) && (net->dest_state & SCTP_ADDR_PF)) { goto no_data_fill; } if (net->flight_size >= net->cwnd) { goto no_data_fill; } if ((asoc->sctp_cmt_on_off > 0) && (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_RECV_BUFFER_SPLITTING) && (net->flight_size > max_rwnd_per_dest)) { goto no_data_fill; } /* * We need a specific accounting for the usage of the send * buffer. We also need to check the number of messages per * net. For now, this is better than nothing and it disabled * by default... */ if ((asoc->sctp_cmt_on_off > 0) && (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_SEND_BUFFER_SPLITTING) && (max_send_per_dest > 0) && (net->flight_size > max_send_per_dest)) { goto no_data_fill; } /*********************/ /* Data transmission */ /*********************/ /* * if AUTH for DATA is required and no AUTH has been added * yet, account for this in the mtu now... if no data can be * bundled, this adjustment won't matter anyways since the * packet will be going out... */ data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks); if (data_auth_reqd && (auth == NULL)) { mtu -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } /* now lets add any data within the MTU constraints */ switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { #ifdef INET case AF_INET: if (net->mtu > SCTP_MIN_V4_OVERHEAD) omtu = net->mtu - SCTP_MIN_V4_OVERHEAD; else omtu = 0; break; #endif #ifdef INET6 case AF_INET6: if (net->mtu > SCTP_MIN_OVERHEAD) omtu = net->mtu - SCTP_MIN_OVERHEAD; else omtu = 0; break; #endif default: /* TSNH */ omtu = 0; break; } if ((((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) && (skip_data_for_this_net == 0)) || (cookie)) { TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { if (no_data_chunks) { /* let only control go out */ *reason_code = 1; break; } if (net->flight_size >= net->cwnd) { /* skip this net, no room for data */ *reason_code = 2; break; } if ((chk->whoTo != NULL) && (chk->whoTo != net)) { /* Don't send the chunk on this net */ continue; } if (asoc->sctp_cmt_on_off == 0) { if ((asoc->alternate) && (asoc->alternate != net) && (chk->whoTo == NULL)) { continue; } else if ((net != asoc->primary_destination) && (asoc->alternate == NULL) && (chk->whoTo == NULL)) { continue; } } if ((chk->send_size > omtu) && ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) == 0)) { /*- * strange, we have a chunk that is * to big for its destination and * yet no fragment ok flag. * Something went wrong when the * PMTU changed...we did not mark * this chunk for some reason?? I * will fix it here by letting IP * fragment it for now and printing * a warning. This really should not * happen ... */ SCTP_PRINTF("Warning chunk of %d bytes > mtu:%d and yet PMTU disc missed\n", chk->send_size, mtu); chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; } if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) == SCTP_STATE_SHUTDOWN_PENDING)) { struct sctp_data_chunk *dchkh; dchkh = mtod(chk->data, struct sctp_data_chunk *); dchkh->ch.chunk_flags |= SCTP_DATA_SACK_IMMEDIATELY; } if (((chk->send_size <= mtu) && (chk->send_size <= r_mtu)) || ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) && (chk->send_size <= asoc->peers_rwnd))) { /* ok we will add this one */ /* * Add an AUTH chunk, if chunk * requires it, save the offset into * the chain for AUTH */ if (data_auth_reqd) { if (auth == NULL) { outchain = sctp_add_auth_chunk(outchain, &endoutchain, &auth, &auth_offset, stcb, SCTP_DATA); auth_keyid = chk->auth_keyid; override_ok = 0; SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else if (override_ok) { /* * use this data's * keyid */ auth_keyid = chk->auth_keyid; override_ok = 0; } else if (auth_keyid != chk->auth_keyid) { /* * different keyid, * so done bundling */ break; } } outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, 0, chk->send_size, chk->copy_by_ref); if (outchain == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "No memory?\n"); if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } *reason_code = 3; SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } /* upate our MTU size */ /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } /* unsigned subtraction of mtu */ if (mtu > chk->send_size) mtu -= chk->send_size; else mtu = 0; /* unsigned subtraction of r_mtu */ if (r_mtu > chk->send_size) r_mtu -= chk->send_size; else r_mtu = 0; to_out += chk->send_size; if ((to_out > mx_mtu) && no_fragmentflg) { #ifdef INVARIANTS panic("Exceeding mtu of %d out size is %d", mx_mtu, to_out); #else SCTP_PRINTF("Exceeding mtu of %d out size is %d\n", mx_mtu, to_out); #endif } chk->window_probe = 0; data_list[bundle_at++] = chk; if (bundle_at >= SCTP_MAX_DATA_BUNDLING) { break; } if (chk->sent == SCTP_DATAGRAM_UNSENT) { if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { SCTP_STAT_INCR_COUNTER64(sctps_outorderchunks); } else { SCTP_STAT_INCR_COUNTER64(sctps_outunorderchunks); } if (((chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) == SCTP_DATA_LAST_FRAG) && ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0)) /* * Count number of * user msg's that * were fragmented * we do this by * counting when we * see a LAST * fragment only. */ SCTP_STAT_INCR_COUNTER64(sctps_fragusrmsgs); } if ((mtu == 0) || (r_mtu == 0) || (one_chunk)) { if ((one_chunk) && (stcb->asoc.total_flight == 0)) { data_list[0]->window_probe = 1; net->window_probe = 1; } break; } } else { /* * Must be sent in order of the * TSN's (on a network) */ break; } } /* for (chunk gather loop for this net) */ } /* if asoc.state OPEN */ no_data_fill: /* Is there something to send for this destination? */ if (outchain) { /* We may need to start a control timer or two */ if (asconf) { sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); /* * do NOT clear the asconf flag as it is * used to do appropriate source address * selection. */ } if (cookie) { sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net); cookie = 0; } /* must start a send timer if data is being sent */ if (bundle_at && (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer))) { /* * no timer running on this destination * restart it. */ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } if (bundle_at || hbflag) { /* For data/asconf and hb set time */ if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(now); *now_filled = 1; } net->last_sent_time = *now; } /* Now send it, if there is anything to send :> */ if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, auth_offset, auth, auth_keyid, no_fragmentflg, bundle_at, asconf, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* error, we could not output */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } if (error == ENOBUFS) { SCTP_STAT_INCR(sctps_lowlevelerr); asoc->ifp_had_enobuf = 1; } if (error == EHOSTUNREACH) { /* * Destination went unreachable * during this send */ sctp_move_chunks_from_net(stcb, net); } *reason_code = 6; /*- * I add this line to be paranoid. As far as * I can tell the continue, takes us back to * the top of the for, but just to make sure * I will reset these again here. */ ctl_cnt = bundle_at = 0; continue; /* This takes us back to the * for() for the nets. */ } else { asoc->ifp_had_enobuf = 0; } endoutchain = NULL; auth = NULL; auth_offset = 0; if (!no_out_cnt) { *num_out += (ctl_cnt + bundle_at); } if (bundle_at) { /* setup for a RTO measurement */ tsns_sent = data_list[0]->rec.data.TSN_seq; /* fill time if not already filled */ if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent); *now_filled = 1; *now = asoc->time_last_sent; } else { asoc->time_last_sent = *now; } if (net->rto_needed) { data_list[0]->do_rtt = 1; net->rto_needed = 0; } SCTP_STAT_INCR_BY(sctps_senddata, bundle_at); sctp_clean_up_datalist(stcb, asoc, data_list, bundle_at, net); } if (one_chunk) { break; } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_SEND); } } if (old_start_at == NULL) { old_start_at = start_at; start_at = TAILQ_FIRST(&asoc->nets); if (old_start_at) goto again_one_more_time; } /* * At the end there should be no NON timed chunks hanging on this * queue. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, *num_out, SCTP_CWND_LOG_FROM_SEND); } if ((*num_out == 0) && (*reason_code == 0)) { *reason_code = 4; } else { *reason_code = 5; } sctp_clean_up_ctl(stcb, asoc, so_locked); return (0); } void sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) { /*- * Prepend a OPERATIONAL_ERROR chunk header and put on the end of * the control chunk queue. */ struct sctp_chunkhdr *hdr; struct sctp_tmit_chunk *chk; struct mbuf *mat, *last_mbuf; uint32_t chunk_length; uint16_t padding_length; SCTP_TCB_LOCK_ASSERT(stcb); SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_NOWAIT); if (op_err == NULL) { return; } last_mbuf = NULL; chunk_length = 0; for (mat = op_err; mat != NULL; mat = SCTP_BUF_NEXT(mat)) { chunk_length += SCTP_BUF_LEN(mat); if (SCTP_BUF_NEXT(mat) == NULL) { last_mbuf = mat; } } if (chunk_length > SCTP_MAX_CHUNK_LENGTH) { sctp_m_freem(op_err); return; } padding_length = chunk_length % 4; if (padding_length != 0) { padding_length = 4 - padding_length; } if (padding_length != 0) { if (sctp_add_pad_tombuf(last_mbuf, padding_length) == NULL) { sctp_m_freem(op_err); return; } } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(op_err); return; } chk->copy_by_ref = 0; chk->send_size = (uint16_t) chunk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = op_err; chk->whoTo = NULL; chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; chk->rec.chunk_id.can_take_data = 0; hdr = mtod(op_err, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_OPERATION_ERROR; hdr->chunk_flags = 0; hdr->chunk_length = htons(chk->send_size); TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } int sctp_send_cookie_echo(struct mbuf *m, int offset, struct sctp_tcb *stcb, struct sctp_nets *net) { /*- * pull out the cookie and put it at the front of the control chunk * queue. */ int at; struct mbuf *cookie; struct sctp_paramhdr parm, *phdr; struct sctp_chunkhdr *hdr; struct sctp_tmit_chunk *chk; uint16_t ptype, plen; SCTP_TCB_LOCK_ASSERT(stcb); /* First find the cookie in the param area */ cookie = NULL; at = offset + sizeof(struct sctp_init_chunk); for (;;) { phdr = sctp_get_next_param(m, at, &parm, sizeof(parm)); if (phdr == NULL) { return (-3); } ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if (ptype == SCTP_STATE_COOKIE) { int pad; /* found the cookie */ if ((pad = (plen % 4))) { plen += 4 - pad; } cookie = SCTP_M_COPYM(m, at, plen, M_NOWAIT); if (cookie == NULL) { /* No memory */ return (-2); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(cookie, SCTP_MBUF_ICOPY); } #endif break; } at += SCTP_SIZE32(plen); } /* ok, we got the cookie lets change it into a cookie echo chunk */ /* first the change from param to cookie */ hdr = mtod(cookie, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_COOKIE_ECHO; hdr->chunk_flags = 0; /* get the chunk stuff now and place it in the FRONT of the queue */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(cookie); return (-5); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_COOKIE_ECHO; chk->rec.chunk_id.can_take_data = 0; chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->send_size = plen; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = cookie; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); TAILQ_INSERT_HEAD(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return (0); } void sctp_send_heartbeat_ack(struct sctp_tcb *stcb, struct mbuf *m, int offset, int chk_length, struct sctp_nets *net) { /* * take a HB request and make it into a HB ack and send it. */ struct mbuf *outchain; struct sctp_chunkhdr *chdr; struct sctp_tmit_chunk *chk; if (net == NULL) /* must have a net pointer */ return; outchain = SCTP_M_COPYM(m, offset, chk_length, M_NOWAIT); if (outchain == NULL) { /* gak out of memory */ return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(outchain, SCTP_MBUF_ICOPY); } #endif chdr = mtod(outchain, struct sctp_chunkhdr *); chdr->chunk_type = SCTP_HEARTBEAT_ACK; chdr->chunk_flags = 0; if (chk_length % 4) { /* need pad */ uint32_t cpthis = 0; int padlen; padlen = 4 - (chk_length % 4); m_copyback(outchain, chk_length, padlen, (caddr_t)&cpthis); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(outchain); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_HEARTBEAT_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = chk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = outchain; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } void sctp_send_cookie_ack(struct sctp_tcb *stcb) { /* formulate and queue a cookie-ack back to sender */ struct mbuf *cookie_ack; struct sctp_chunkhdr *hdr; struct sctp_tmit_chunk *chk; SCTP_TCB_LOCK_ASSERT(stcb); cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER); if (cookie_ack == NULL) { /* no mbuf's */ return; } SCTP_BUF_RESV_UF(cookie_ack, SCTP_MIN_OVERHEAD); sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(cookie_ack); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_COOKIE_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = cookie_ack; if (chk->asoc->last_control_chunk_from != NULL) { chk->whoTo = chk->asoc->last_control_chunk_from; atomic_add_int(&chk->whoTo->ref_count, 1); } else { chk->whoTo = NULL; } hdr = mtod(cookie_ack, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_COOKIE_ACK; hdr->chunk_flags = 0; hdr->chunk_length = htons(chk->send_size); SCTP_BUF_LEN(cookie_ack) = chk->send_size; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return; } void sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net) { /* formulate and queue a SHUTDOWN-ACK back to the sender */ struct mbuf *m_shutdown_ack; struct sctp_shutdown_ack_chunk *ack_cp; struct sctp_tmit_chunk *chk; m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown_ack == NULL) { /* no mbuf's */ return; } SCTP_BUF_RESV_UF(m_shutdown_ack, SCTP_MIN_OVERHEAD); sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(m_shutdown_ack); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_SHUTDOWN_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown_ack; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } ack_cp = mtod(m_shutdown_ack, struct sctp_shutdown_ack_chunk *); ack_cp->ch.chunk_type = SCTP_SHUTDOWN_ACK; ack_cp->ch.chunk_flags = 0; ack_cp->ch.chunk_length = htons(chk->send_size); SCTP_BUF_LEN(m_shutdown_ack) = chk->send_size; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return; } void sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net) { /* formulate and queue a SHUTDOWN to the sender */ struct mbuf *m_shutdown; struct sctp_shutdown_chunk *shutdown_cp; struct sctp_tmit_chunk *chk; m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown == NULL) { /* no mbuf's */ return; } SCTP_BUF_RESV_UF(m_shutdown, SCTP_MIN_OVERHEAD); sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(m_shutdown); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_SHUTDOWN; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = sizeof(struct sctp_shutdown_chunk); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } shutdown_cp = mtod(m_shutdown, struct sctp_shutdown_chunk *); shutdown_cp->ch.chunk_type = SCTP_SHUTDOWN; shutdown_cp->ch.chunk_flags = 0; shutdown_cp->ch.chunk_length = htons(chk->send_size); shutdown_cp->cumulative_tsn_ack = htonl(stcb->asoc.cumulative_tsn); SCTP_BUF_LEN(m_shutdown) = chk->send_size; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return; } void sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked) { /* * formulate and queue an ASCONF to the peer. ASCONF parameters * should be queued on the assoc queue. */ struct sctp_tmit_chunk *chk; struct mbuf *m_asconf; int len; SCTP_TCB_LOCK_ASSERT(stcb); if ((!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue)) && (!sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS))) { /* can't send a new one if there is one in flight already */ return; } /* compose an ASCONF chunk, maximum length is PMTU */ m_asconf = sctp_compose_asconf(stcb, &len, addr_locked); if (m_asconf == NULL) { return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(m_asconf); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ASCONF; chk->rec.chunk_id.can_take_data = 0; chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->data = m_asconf; chk->send_size = len; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } TAILQ_INSERT_TAIL(&chk->asoc->asconf_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return; } void sctp_send_asconf_ack(struct sctp_tcb *stcb) { /* * formulate and queue a asconf-ack back to sender. the asconf-ack * must be stored in the tcb. */ struct sctp_tmit_chunk *chk; struct sctp_asconf_ack *ack, *latest_ack; struct mbuf *m_ack; struct sctp_nets *net = NULL; SCTP_TCB_LOCK_ASSERT(stcb); /* Get the latest ASCONF-ACK */ latest_ack = TAILQ_LAST(&stcb->asoc.asconf_ack_sent, sctp_asconf_ackhead); if (latest_ack == NULL) { return; } if (latest_ack->last_sent_to != NULL && latest_ack->last_sent_to == stcb->asoc.last_control_chunk_from) { /* we're doing a retransmission */ net = sctp_find_alternate_net(stcb, stcb->asoc.last_control_chunk_from, 0); if (net == NULL) { /* no alternate */ if (stcb->asoc.last_control_chunk_from == NULL) { if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } } else { net = stcb->asoc.last_control_chunk_from; } } } else { /* normal case */ if (stcb->asoc.last_control_chunk_from == NULL) { if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } } else { net = stcb->asoc.last_control_chunk_from; } } latest_ack->last_sent_to = net; TAILQ_FOREACH(ack, &stcb->asoc.asconf_ack_sent, next) { if (ack->data == NULL) { continue; } /* copy the asconf_ack */ m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_NOWAIT); if (m_ack == NULL) { /* couldn't copy it */ return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m_ack, SCTP_MBUF_ICOPY); } #endif sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ if (m_ack) sctp_m_freem(m_ack); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ASCONF_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } chk->data = m_ack; chk->send_size = ack->len; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } return; } static int sctp_chunk_retransmission(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc, int *cnt_out, struct timeval *now, int *now_filled, int *fr_done, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { /*- * send out one MTU of retransmission. If fast_retransmit is * happening we ignore the cwnd. Otherwise we obey the cwnd and * rwnd. For a Cookie or Asconf in the control chunk queue we * retransmit them by themselves. * * For data chunks we will pick out the lowest TSN's in the sent_queue * marked for resend and bundle them all together (up to a MTU of * destination). The address to send to should have been * selected/changed where the retransmission was marked (i.e. in FR * or t3-timeout routines). */ struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING]; struct sctp_tmit_chunk *chk, *fwd; struct mbuf *m, *endofchain; struct sctp_nets *net = NULL; uint32_t tsns_sent = 0; int no_fragmentflg, bundle_at, cnt_thru; unsigned int mtu; int error, i, one_chunk, fwd_tsn, ctl_cnt, tmr_started; struct sctp_auth_chunk *auth = NULL; uint32_t auth_offset = 0; uint16_t auth_keyid; int override_ok = 1; int data_auth_reqd = 0; uint32_t dmtu = 0; SCTP_TCB_LOCK_ASSERT(stcb); tmr_started = ctl_cnt = bundle_at = error = 0; no_fragmentflg = 1; fwd_tsn = 0; *cnt_out = 0; fwd = NULL; endofchain = m = NULL; auth_keyid = stcb->asoc.authinfo.active_keyid; #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC3, 1); #endif if ((TAILQ_EMPTY(&asoc->sent_queue)) && (TAILQ_EMPTY(&asoc->control_send_queue))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "SCTP hits empty queue with cnt set to %d?\n", asoc->sent_queue_retran_cnt); asoc->sent_queue_cnt = 0; asoc->sent_queue_cnt_removeable = 0; /* send back 0/0 so we enter normal transmission */ *cnt_out = 0; return (0); } TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) || (chk->rec.chunk_id.id == SCTP_STREAM_RESET) || (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN)) { if (chk->sent != SCTP_DATAGRAM_RESEND) { continue; } if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) { if (chk != asoc->str_reset) { /* * not eligible for retran if its * not ours */ continue; } } ctl_cnt++; if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { fwd_tsn = 1; } /* * Add an AUTH chunk, if chunk requires it save the * offset into the chain for AUTH */ if ((auth == NULL) && (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks))) { m = sctp_add_auth_chunk(m, &endofchain, &auth, &auth_offset, stcb, chk->rec.chunk_id.id); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref); break; } } one_chunk = 0; cnt_thru = 0; /* do we have control chunks to retransmit? */ if (m != NULL) { /* Start a timer no matter if we succeed or fail */ if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, chk->whoTo); } else if (chk->rec.chunk_id.id == SCTP_ASCONF) sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, chk->whoTo); chk->snd_count++; /* update our count */ if ((error = sctp_lowlevel_chunk_output(inp, stcb, chk->whoTo, (struct sockaddr *)&chk->whoTo->ro._l_addr, m, auth_offset, auth, stcb->asoc.authinfo.active_keyid, no_fragmentflg, 0, 0, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), chk->whoTo->port, NULL, 0, 0, so_locked))) { SCTP_STAT_INCR(sctps_lowlevelerr); return (error); } endofchain = NULL; auth = NULL; auth_offset = 0; /* * We don't want to mark the net->sent time here since this * we use this for HB and retrans cannot measure RTT */ /* (void)SCTP_GETTIME_TIMEVAL(&chk->whoTo->last_sent_time); */ *cnt_out += 1; chk->sent = SCTP_DATAGRAM_SENT; sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt); if (fwd_tsn == 0) { return (0); } else { /* Clean up the fwd-tsn list */ sctp_clean_up_ctl(stcb, asoc, so_locked); return (0); } } /* * Ok, it is just data retransmission we need to do or that and a * fwd-tsn with it all. */ if (TAILQ_EMPTY(&asoc->sent_queue)) { return (SCTP_RETRAN_DONE); } if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT)) { /* not yet open, resend the cookie and that is it */ return (1); } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(20, inp, stcb, NULL); #endif data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks); TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { if (chk->sent != SCTP_DATAGRAM_RESEND) { /* No, not sent to this net or not ready for rtx */ continue; } if (chk->data == NULL) { SCTP_PRINTF("TSN:%x chk->snd_count:%d chk->sent:%d can't retran - no data\n", chk->rec.data.TSN_seq, chk->snd_count, chk->sent); continue; } if ((SCTP_BASE_SYSCTL(sctp_max_retran_chunk)) && (chk->snd_count >= SCTP_BASE_SYSCTL(sctp_max_retran_chunk))) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; snprintf(msg, sizeof(msg), "TSN %8.8x retransmitted %d times, giving up", chk->rec.data.TSN_seq, chk->snd_count); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); atomic_add_int(&stcb->asoc.refcnt, 1); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, so_locked); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); return (SCTP_RETRAN_EXIT); } /* pick up the net */ net = chk->whoTo; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } if ((asoc->peers_rwnd < mtu) && (asoc->total_flight > 0)) { /* No room in peers rwnd */ uint32_t tsn; tsn = asoc->last_acked_seq + 1; if (tsn == chk->rec.data.TSN_seq) { /* * we make a special exception for this * case. The peer has no rwnd but is missing * the lowest chunk.. which is probably what * is holding up the rwnd. */ goto one_chunk_around; } return (1); } one_chunk_around: if (asoc->peers_rwnd < mtu) { one_chunk = 1; if ((asoc->peers_rwnd == 0) && (asoc->total_flight == 0)) { chk->window_probe = 1; chk->whoTo->window_probe = 1; } } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC3, 2); #endif bundle_at = 0; m = NULL; net->fast_retran_ip = 0; if (chk->rec.data.doing_fast_retransmit == 0) { /* * if no FR in progress skip destination that have * flight_size > cwnd. */ if (net->flight_size >= net->cwnd) { continue; } } else { /* * Mark the destination net to have FR recovery * limits put on it. */ *fr_done = 1; net->fast_retran_ip = 1; } /* * if no AUTH is yet included and this chunk requires it, * make sure to account for it. We don't apply the size * until the AUTH chunk is actually added below in case * there is no room for this chunk. */ if (data_auth_reqd && (auth == NULL)) { dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else dmtu = 0; if ((chk->send_size <= (mtu - dmtu)) || (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { /* ok we will add this one */ if (data_auth_reqd) { if (auth == NULL) { m = sctp_add_auth_chunk(m, &endofchain, &auth, &auth_offset, stcb, SCTP_DATA); auth_keyid = chk->auth_keyid; override_ok = 0; SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else if (override_ok) { auth_keyid = chk->auth_keyid; override_ok = 0; } else if (chk->auth_keyid != auth_keyid) { /* different keyid, so done bundling */ break; } } m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref); if (m == NULL) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } /* upate our MTU size */ if (mtu > (chk->send_size + dmtu)) mtu -= (chk->send_size + dmtu); else mtu = 0; data_list[bundle_at++] = chk; if (one_chunk && (asoc->total_flight <= 0)) { SCTP_STAT_INCR(sctps_windowprobed); } } if (one_chunk == 0) { /* * now are there anymore forward from chk to pick * up? */ for (fwd = TAILQ_NEXT(chk, sctp_next); fwd != NULL; fwd = TAILQ_NEXT(fwd, sctp_next)) { if (fwd->sent != SCTP_DATAGRAM_RESEND) { /* Nope, not for retran */ continue; } if (fwd->whoTo != net) { /* Nope, not the net in question */ continue; } if (data_auth_reqd && (auth == NULL)) { dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else dmtu = 0; if (fwd->send_size <= (mtu - dmtu)) { if (data_auth_reqd) { if (auth == NULL) { m = sctp_add_auth_chunk(m, &endofchain, &auth, &auth_offset, stcb, SCTP_DATA); auth_keyid = fwd->auth_keyid; override_ok = 0; SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else if (override_ok) { auth_keyid = fwd->auth_keyid; override_ok = 0; } else if (fwd->auth_keyid != auth_keyid) { /* * different keyid, * so done bundling */ break; } } m = sctp_copy_mbufchain(fwd->data, m, &endofchain, 0, fwd->send_size, fwd->copy_by_ref); if (m == NULL) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } /* Do clear IP_DF ? */ if (fwd->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } /* upate our MTU size */ if (mtu > (fwd->send_size + dmtu)) mtu -= (fwd->send_size + dmtu); else mtu = 0; data_list[bundle_at++] = fwd; if (bundle_at >= SCTP_MAX_DATA_BUNDLING) { break; } } else { /* can't fit so we are done */ break; } } } /* Is there something to send for this destination? */ if (m) { /* * No matter if we fail/or succeed we should start a * timer. A failure is like a lost IP packet :-) */ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { /* * no timer running on this destination * restart it. */ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); tmr_started = 1; } /* Now lets send it, if there is anything to send :> */ if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, m, auth_offset, auth, auth_keyid, no_fragmentflg, 0, 0, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* error, we could not output */ SCTP_STAT_INCR(sctps_lowlevelerr); return (error); } endofchain = NULL; auth = NULL; auth_offset = 0; /* For HB's */ /* * We don't want to mark the net->sent time here * since this we use this for HB and retrans cannot * measure RTT */ /* (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); */ /* For auto-close */ cnt_thru++; if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent); *now = asoc->time_last_sent; *now_filled = 1; } else { asoc->time_last_sent = *now; } *cnt_out += bundle_at; #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC4, bundle_at); #endif if (bundle_at) { tsns_sent = data_list[0]->rec.data.TSN_seq; } for (i = 0; i < bundle_at; i++) { SCTP_STAT_INCR(sctps_sendretransdata); data_list[i]->sent = SCTP_DATAGRAM_SENT; /* * When we have a revoked data, and we * retransmit it, then we clear the revoked * flag since this flag dictates if we * subtracted from the fs */ if (data_list[i]->rec.data.chunk_was_revoked) { /* Deflate the cwnd */ data_list[i]->whoTo->cwnd -= data_list[i]->book_size; data_list[i]->rec.data.chunk_was_revoked = 0; } data_list[i]->snd_count++; sctp_ucount_decr(asoc->sent_queue_retran_cnt); /* record the time */ data_list[i]->sent_rcv_time = asoc->time_last_sent; if (data_list[i]->book_size_scale) { /* * need to double the book size on * this one */ data_list[i]->book_size_scale = 0; /* * Since we double the booksize, we * must also double the output queue * size, since this get shrunk when * we free by this amount. */ atomic_add_int(&((asoc)->total_output_queue_size), data_list[i]->book_size); data_list[i]->book_size *= 2; } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd(SCTP_DECREASE_PEER_RWND, asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); } asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd, (uint32_t) (data_list[i]->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_UP_RSND, data_list[i]->whoTo->flight_size, data_list[i]->book_size, (uint32_t) (uintptr_t) data_list[i]->whoTo, data_list[i]->rec.data.TSN_seq); } sctp_flight_size_increase(data_list[i]); sctp_total_flight_increase(stcb, data_list[i]); if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } if ((i == 0) && (data_list[i]->rec.data.doing_fast_retransmit)) { SCTP_STAT_INCR(sctps_sendfastretrans); if ((data_list[i] == TAILQ_FIRST(&asoc->sent_queue)) && (tmr_started == 0)) { /*- * ok we just fast-retrans'd * the lowest TSN, i.e the * first on the list. In * this case we want to give * some more time to get a * SACK back without a * t3-expiring. */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_2); sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_RESEND); } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(21, inp, stcb, NULL); #endif } else { /* None will fit */ return (1); } if (asoc->sent_queue_retran_cnt <= 0) { /* all done we have no more to retran */ asoc->sent_queue_retran_cnt = 0; break; } if (one_chunk) { /* No more room in rwnd */ return (1); } /* stop the for loop here. we sent out a packet */ break; } return (0); } static void sctp_timer_validation(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_nets *net; /* Validate that a timer is running somewhere */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { /* Here is a timer */ return; } } SCTP_TCB_LOCK_ASSERT(stcb); /* Gak, we did not have a timer somewhere */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Deadlock avoided starting timer on a dest at retran\n"); if (asoc->alternate) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->alternate); } else { sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->primary_destination); } return; } void sctp_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_where, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { /*- * Ok this is the generic chunk service queue. we must do the * following: * - See if there are retransmits pending, if so we must * do these first. * - Service the stream queue that is next, moving any * message (note I must get a complete message i.e. * FIRST/MIDDLE and LAST to the out queue in one pass) and assigning * TSN's * - Check to see if the cwnd/rwnd allows any output, if so we * go ahead and fomulate and send the low level chunks. Making sure * to combine any control in the control chunk queue also. */ struct sctp_association *asoc; struct sctp_nets *net; int error = 0, num_out, tot_out = 0, ret = 0, reason_code; unsigned int burst_cnt = 0; struct timeval now; int now_filled = 0; int nagle_on; int frag_point = sctp_get_frag_point(stcb, &stcb->asoc); int un_sent = 0; int fr_done; unsigned int tot_frs = 0; asoc = &stcb->asoc; do_it_again: /* The Nagle algorithm is only applied when handling a send call. */ if (from_where == SCTP_OUTPUT_FROM_USR_SEND) { if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY)) { nagle_on = 0; } else { nagle_on = 1; } } else { nagle_on = 0; } SCTP_TCB_LOCK_ASSERT(stcb); un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight); if ((un_sent <= 0) && (TAILQ_EMPTY(&asoc->control_send_queue)) && (TAILQ_EMPTY(&asoc->asconf_send_queue)) && (asoc->sent_queue_retran_cnt == 0) && (asoc->trigger_reset == 0)) { /* Nothing to do unless there is something to be sent left */ return; } /* * Do we have something to send, data or control AND a sack timer * running, if so piggy-back the sack. */ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_send_sack(stcb, so_locked); (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer); } while (asoc->sent_queue_retran_cnt) { /*- * Ok, it is retransmission time only, we send out only ONE * packet with a single call off to the retran code. */ if (from_where == SCTP_OUTPUT_FROM_COOKIE_ACK) { /*- * Special hook for handling cookiess discarded * by peer that carried data. Send cookie-ack only * and then the next call with get the retran's. */ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where, &now, &now_filled, frag_point, so_locked); return; } else if (from_where != SCTP_OUTPUT_FROM_HB_TMR) { /* if its not from a HB then do it */ fr_done = 0; ret = sctp_chunk_retransmission(inp, stcb, asoc, &num_out, &now, &now_filled, &fr_done, so_locked); if (fr_done) { tot_frs++; } } else { /* * its from any other place, we don't allow retran * output (only control) */ ret = 1; } if (ret > 0) { /* Can't send anymore */ /*- * now lets push out control by calling med-level * output once. this assures that we WILL send HB's * if queued too. */ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where, &now, &now_filled, frag_point, so_locked); #ifdef SCTP_AUDITING_ENABLED sctp_auditing(8, inp, stcb, NULL); #endif sctp_timer_validation(inp, stcb, asoc); return; } if (ret < 0) { /*- * The count was off.. retran is not happening so do * the normal retransmission. */ #ifdef SCTP_AUDITING_ENABLED sctp_auditing(9, inp, stcb, NULL); #endif if (ret == SCTP_RETRAN_EXIT) { return; } break; } if (from_where == SCTP_OUTPUT_FROM_T3) { /* Only one transmission allowed out of a timeout */ #ifdef SCTP_AUDITING_ENABLED sctp_auditing(10, inp, stcb, NULL); #endif /* Push out any control */ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where, &now, &now_filled, frag_point, so_locked); return; } if ((asoc->fr_max_burst > 0) && (tot_frs >= asoc->fr_max_burst)) { /* Hit FR burst limit */ return; } if ((num_out == 0) && (ret == 0)) { /* No more retrans to send */ break; } } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(12, inp, stcb, NULL); #endif /* Check for bad destinations, if they exist move chunks around. */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (!(net->dest_state & SCTP_ADDR_REACHABLE)) { /*- * if possible move things off of this address we * still may send below due to the dormant state but * we try to find an alternate address to send to * and if we have one we move all queued data on the * out wheel to this alternate address. */ if (net->ref_count > 1) sctp_move_chunks_from_net(stcb, net); } else { /*- * if ((asoc->sat_network) || (net->addr_is_local)) * { burst_limit = asoc->max_burst * * SCTP_SAT_NETWORK_BURST_INCR; } */ if (asoc->max_burst > 0) { if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst)) { if ((net->flight_size + (asoc->max_burst * net->mtu)) < net->cwnd) { /* * JRS - Use the congestion * control given in the * congestion control module */ asoc->cc_functions.sctp_cwnd_update_after_output(stcb, net, asoc->max_burst); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { sctp_log_maxburst(stcb, net, 0, asoc->max_burst, SCTP_MAX_BURST_APPLIED); } SCTP_STAT_INCR(sctps_maxburstqueued); } net->fast_retran_ip = 0; } else { if (net->flight_size == 0) { /* * Should be decaying the * cwnd here */ ; } } } } } burst_cnt = 0; do { error = sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 0, from_where, &now, &now_filled, frag_point, so_locked); if (error) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Error %d was returned from med-c-op\n", error); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { sctp_log_maxburst(stcb, asoc->primary_destination, error, burst_cnt, SCTP_MAX_BURST_ERROR_STOP); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, NULL, error, SCTP_SEND_NOW_COMPLETES); sctp_log_cwnd(stcb, NULL, 0xdeadbeef, SCTP_SEND_NOW_COMPLETES); } break; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "m-c-o put out %d\n", num_out); tot_out += num_out; burst_cnt++; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, NULL, num_out, SCTP_SEND_NOW_COMPLETES); if (num_out == 0) { sctp_log_cwnd(stcb, NULL, reason_code, SCTP_SEND_NOW_COMPLETES); } } if (nagle_on) { /* * When the Nagle algorithm is used, look at how * much is unsent, then if its smaller than an MTU * and we have data in flight we stop, except if we * are handling a fragmented user message. */ un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); if ((un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) && (stcb->asoc.total_flight > 0)) { /* && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {*/ break; } } if (TAILQ_EMPTY(&asoc->control_send_queue) && TAILQ_EMPTY(&asoc->send_queue) && sctp_is_there_unsent_data(stcb, so_locked) == 0) { /* Nothing left to send */ break; } if ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) <= 0) { /* Nothing left to send */ break; } } while (num_out && ((asoc->max_burst == 0) || SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) || (burst_cnt < asoc->max_burst))); if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) == 0) { if ((asoc->max_burst > 0) && (burst_cnt >= asoc->max_burst)) { SCTP_STAT_INCR(sctps_maxburstqueued); asoc->burst_limit_applied = 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { sctp_log_maxburst(stcb, asoc->primary_destination, 0, burst_cnt, SCTP_MAX_BURST_APPLIED); } } else { asoc->burst_limit_applied = 0; } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, NULL, tot_out, SCTP_SEND_NOW_COMPLETES); } SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, we have put out %d chunks\n", tot_out); /*- * Now we need to clean up the control chunk chain if a ECNE is on * it. It must be marked as UNSENT again so next call will continue * to send it until such time that we get a CWR, to remove it. */ if (stcb->asoc.ecn_echo_cnt_onq) sctp_fix_ecn_echo(asoc); if (stcb->asoc.trigger_reset) { if (sctp_send_stream_reset_out_if_possible(stcb, so_locked) == 0) { goto do_it_again; } } return; } int sctp_output( struct sctp_inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p, int flags) { if (inp == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } if (inp->sctp_socket == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } return (sctp_sosend(inp->sctp_socket, addr, (struct uio *)NULL, m, control, flags, p )); } void send_forward_tsn(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *chk, *at, *tp1, *last; struct sctp_forward_tsn_chunk *fwdtsn; struct sctp_strseq *strseq; struct sctp_strseq_mid *strseq_m; uint32_t advance_peer_ack_point; unsigned int cnt_of_space, i, ovh; unsigned int space_needed; unsigned int cnt_of_skipped = 0; int old; if (asoc->idata_supported) { old = 0; } else { old = 1; } SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { /* mark it to unsent */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; /* Do we correct its output location? */ if (chk->whoTo) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } goto sctp_fill_in_rest; } } /* Ok if we reach here we must build one */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } asoc->fwd_tsn_cnt++; chk->copy_by_ref = 0; /* * We don't do the old thing here since this is used not for on-wire * but to tell if we are sending a fwd-tsn by the stack during * output. And if its a IFORWARD or a FORWARD it is a fwd-tsn. */ chk->rec.chunk_id.id = SCTP_FORWARD_CUM_TSN; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = asoc; chk->whoTo = NULL; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; sctp_fill_in_rest: /*- * Here we go through and fill out the part that deals with * stream/seq of the ones we skip. */ SCTP_BUF_LEN(chk->data) = 0; TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { if ((at->sent != SCTP_FORWARD_TSN_SKIP) && (at->sent != SCTP_DATAGRAM_NR_ACKED)) { /* no more to look at */ break; } if (old && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { /* We don't report these */ continue; } cnt_of_skipped++; } if (old) { space_needed = (sizeof(struct sctp_forward_tsn_chunk) + (cnt_of_skipped * sizeof(struct sctp_strseq))); } else { space_needed = (sizeof(struct sctp_forward_tsn_chunk) + (cnt_of_skipped * sizeof(struct sctp_strseq_mid))); } cnt_of_space = (unsigned int)M_TRAILINGSPACE(chk->data); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MIN_OVERHEAD; } else { ovh = SCTP_MIN_V4_OVERHEAD; } if (cnt_of_space > (asoc->smallest_mtu - ovh)) { /* trim to a mtu size */ cnt_of_space = asoc->smallest_mtu - ovh; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, 0xff, 0, cnt_of_skipped, asoc->advanced_peer_ack_point); } advance_peer_ack_point = asoc->advanced_peer_ack_point; if (cnt_of_space < space_needed) { /*- * ok we must trim down the chunk by lowering the * advance peer ack point. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, 0xff, 0xff, cnt_of_space, space_needed); } if (old) { cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk); cnt_of_skipped /= sizeof(struct sctp_strseq); } else { cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk); cnt_of_skipped /= sizeof(struct sctp_strseq_mid); } /*- * Go through and find the TSN that will be the one * we report. */ at = TAILQ_FIRST(&asoc->sent_queue); if (at != NULL) { for (i = 0; i < cnt_of_skipped; i++) { tp1 = TAILQ_NEXT(at, sctp_next); if (tp1 == NULL) { break; } at = tp1; } } if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, 0xff, cnt_of_skipped, at->rec.data.TSN_seq, asoc->advanced_peer_ack_point); } last = at; /*- * last now points to last one I can report, update * peer ack point */ if (last) { advance_peer_ack_point = last->rec.data.TSN_seq; } if (old) { space_needed = sizeof(struct sctp_forward_tsn_chunk) + cnt_of_skipped * sizeof(struct sctp_strseq); } else { space_needed = sizeof(struct sctp_forward_tsn_chunk) + cnt_of_skipped * sizeof(struct sctp_strseq_mid); } } chk->send_size = space_needed; /* Setup the chunk */ fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *); fwdtsn->ch.chunk_length = htons(chk->send_size); fwdtsn->ch.chunk_flags = 0; if (old) { fwdtsn->ch.chunk_type = SCTP_FORWARD_CUM_TSN; } else { fwdtsn->ch.chunk_type = SCTP_IFORWARD_CUM_TSN; } fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point); SCTP_BUF_LEN(chk->data) = chk->send_size; fwdtsn++; /*- * Move pointer to after the fwdtsn and transfer to the * strseq pointer. */ if (old) { strseq = (struct sctp_strseq *)fwdtsn; } else { strseq_m = (struct sctp_strseq_mid *)fwdtsn; } /*- * Now populate the strseq list. This is done blindly * without pulling out duplicate stream info. This is * inefficent but won't harm the process since the peer will * look at these in sequence and will thus release anything. * It could mean we exceed the PMTU and chop off some that * we could have included.. but this is unlikely (aka 1432/4 * would mean 300+ stream seq's would have to be reported in * one FWD-TSN. With a bit of work we can later FIX this to * optimize and pull out duplicates.. but it does add more * overhead. So for now... not! */ i = 0; TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { if (i >= cnt_of_skipped) { break; } if (old && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { /* We don't report these */ continue; } if (at->rec.data.TSN_seq == advance_peer_ack_point) { at->rec.data.fwd_tsn_cnt = 0; } if (old) { strseq->stream = htons(at->rec.data.stream_number); strseq->sequence = htons((uint16_t) at->rec.data.stream_seq); strseq++; } else { strseq_m->stream = htons(at->rec.data.stream_number); if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) { strseq_m->flags = htons(PR_SCTP_UNORDERED_FLAG); } else { strseq_m->flags = 0; } strseq_m->msg_id = htonl(at->rec.data.stream_seq); strseq_m++; } i++; } return; } void sctp_send_sack(struct sctp_tcb *stcb, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { /*- * Queue up a SACK or NR-SACK in the control queue. * We must first check to see if a SACK or NR-SACK is * somehow on the control queue. * If so, we will take and and remove the old one. */ struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *a_chk; struct sctp_sack_chunk *sack; struct sctp_nr_sack_chunk *nr_sack; struct sctp_gap_ack_block *gap_descriptor; const struct sack_track *selector; int mergeable = 0; int offset; caddr_t limit; uint32_t *dup; int limit_reached = 0; unsigned int i, siz, j; unsigned int num_gap_blocks = 0, num_nr_gap_blocks = 0, space; int num_dups = 0; int space_req; uint32_t highest_tsn; uint8_t flags; uint8_t type; uint8_t tsn_map; if (stcb->asoc.nrsack_supported == 1) { type = SCTP_NR_SELECTIVE_ACK; } else { type = SCTP_SELECTIVE_ACK; } a_chk = NULL; asoc = &stcb->asoc; SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->last_data_chunk_from == NULL) { /* Hmm we never received anything */ return; } sctp_slide_mapping_arrays(stcb); sctp_set_rwnd(stcb, asoc); TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == type) { /* Hmm, found a sack already on queue, remove it */ TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt--; a_chk = chk; if (a_chk->data) { sctp_m_freem(a_chk->data); a_chk->data = NULL; } if (a_chk->whoTo) { sctp_free_remote_addr(a_chk->whoTo); a_chk->whoTo = NULL; } break; } } if (a_chk == NULL) { sctp_alloc_a_chunk(stcb, a_chk); if (a_chk == NULL) { /* No memory so we drop the idea, and set a timer */ if (stcb->asoc.delayed_ack) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_3); sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } else { stcb->asoc.send_sack = 1; } return; } a_chk->copy_by_ref = 0; a_chk->rec.chunk_id.id = type; a_chk->rec.chunk_id.can_take_data = 1; } /* Clear our pkt counts */ asoc->data_pkts_seen = 0; a_chk->flags = 0; a_chk->asoc = asoc; a_chk->snd_count = 0; a_chk->send_size = 0; /* fill in later */ a_chk->sent = SCTP_DATAGRAM_UNSENT; a_chk->whoTo = NULL; if (!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE)) { /*- * Ok, the destination for the SACK is unreachable, lets see if * we can select an alternate to asoc->last_data_chunk_from */ a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0); if (a_chk->whoTo == NULL) { /* Nope, no alternate */ a_chk->whoTo = asoc->last_data_chunk_from; } } else { a_chk->whoTo = asoc->last_data_chunk_from; } if (a_chk->whoTo) { atomic_add_int(&a_chk->whoTo->ref_count, 1); } if (SCTP_TSN_GT(asoc->highest_tsn_inside_map, asoc->highest_tsn_inside_nr_map)) { highest_tsn = asoc->highest_tsn_inside_map; } else { highest_tsn = asoc->highest_tsn_inside_nr_map; } if (highest_tsn == asoc->cumulative_tsn) { /* no gaps */ if (type == SCTP_SELECTIVE_ACK) { space_req = sizeof(struct sctp_sack_chunk); } else { space_req = sizeof(struct sctp_nr_sack_chunk); } } else { /* gaps get a cluster */ space_req = MCLBYTES; } /* Ok now lets formulate a MBUF with our sack */ a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_NOWAIT, 1, MT_DATA); if ((a_chk->data == NULL) || (a_chk->whoTo == NULL)) { /* rats, no mbuf memory */ if (a_chk->data) { /* was a problem with the destination */ sctp_m_freem(a_chk->data); a_chk->data = NULL; } sctp_free_a_chunk(stcb, a_chk, so_locked); /* sa_ignore NO_NULL_CHK */ if (stcb->asoc.delayed_ack) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4); sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } else { stcb->asoc.send_sack = 1; } return; } /* ok, lets go through and fill it in */ SCTP_BUF_RESV_UF(a_chk->data, SCTP_MIN_OVERHEAD); space = (unsigned int)M_TRAILINGSPACE(a_chk->data); if (space > (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD)) { space = (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD); } limit = mtod(a_chk->data, caddr_t); limit += space; flags = 0; if ((asoc->sctp_cmt_on_off > 0) && SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { /*- * CMT DAC algorithm: If 2 (i.e., 0x10) packets have been * received, then set high bit to 1, else 0. Reset * pkts_rcvd. */ flags |= (asoc->cmt_dac_pkts_rcvd << 6); asoc->cmt_dac_pkts_rcvd = 0; } #ifdef SCTP_ASOCLOG_OF_TSNS stcb->asoc.cumack_logsnt[stcb->asoc.cumack_log_atsnt] = asoc->cumulative_tsn; stcb->asoc.cumack_log_atsnt++; if (stcb->asoc.cumack_log_atsnt >= SCTP_TSN_LOG_SIZE) { stcb->asoc.cumack_log_atsnt = 0; } #endif /* reset the readers interpretation */ stcb->freed_by_sorcv_sincelast = 0; if (type == SCTP_SELECTIVE_ACK) { sack = mtod(a_chk->data, struct sctp_sack_chunk *); nr_sack = NULL; gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)sack + sizeof(struct sctp_sack_chunk)); if (highest_tsn > asoc->mapping_array_base_tsn) { siz = (((highest_tsn - asoc->mapping_array_base_tsn) + 1) + 7) / 8; } else { siz = (((MAX_TSN - highest_tsn) + 1) + highest_tsn + 7) / 8; } } else { sack = NULL; nr_sack = mtod(a_chk->data, struct sctp_nr_sack_chunk *); gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)nr_sack + sizeof(struct sctp_nr_sack_chunk)); if (asoc->highest_tsn_inside_map > asoc->mapping_array_base_tsn) { siz = (((asoc->highest_tsn_inside_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8; } else { siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_map + 7) / 8; } } if (SCTP_TSN_GT(asoc->mapping_array_base_tsn, asoc->cumulative_tsn)) { offset = 1; } else { offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn; } if (((type == SCTP_SELECTIVE_ACK) && SCTP_TSN_GT(highest_tsn, asoc->cumulative_tsn)) || ((type == SCTP_NR_SELECTIVE_ACK) && SCTP_TSN_GT(asoc->highest_tsn_inside_map, asoc->cumulative_tsn))) { /* we have a gap .. maybe */ for (i = 0; i < siz; i++) { tsn_map = asoc->mapping_array[i]; if (type == SCTP_SELECTIVE_ACK) { tsn_map |= asoc->nr_mapping_array[i]; } if (i == 0) { /* * Clear all bits corresponding to TSNs * smaller or equal to the cumulative TSN. */ tsn_map &= (~0U << (1 - offset)); } selector = &sack_array[tsn_map]; if (mergeable && selector->right_edge) { /* * Backup, left and right edges were ok to * merge. */ num_gap_blocks--; gap_descriptor--; } if (selector->num_entries == 0) mergeable = 0; else { for (j = 0; j < selector->num_entries; j++) { if (mergeable && selector->right_edge) { /* * do a merge by NOT setting * the left side */ mergeable = 0; } else { /* * no merge, set the left * side */ mergeable = 0; gap_descriptor->start = htons((selector->gaps[j].start + offset)); } gap_descriptor->end = htons((selector->gaps[j].end + offset)); num_gap_blocks++; gap_descriptor++; if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) { /* no more room */ limit_reached = 1; break; } } if (selector->left_edge) { mergeable = 1; } } if (limit_reached) { /* Reached the limit stop */ break; } offset += 8; } } if ((type == SCTP_NR_SELECTIVE_ACK) && (limit_reached == 0)) { mergeable = 0; if (asoc->highest_tsn_inside_nr_map > asoc->mapping_array_base_tsn) { siz = (((asoc->highest_tsn_inside_nr_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8; } else { siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_nr_map + 7) / 8; } if (SCTP_TSN_GT(asoc->mapping_array_base_tsn, asoc->cumulative_tsn)) { offset = 1; } else { offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn; } if (SCTP_TSN_GT(asoc->highest_tsn_inside_nr_map, asoc->cumulative_tsn)) { /* we have a gap .. maybe */ for (i = 0; i < siz; i++) { tsn_map = asoc->nr_mapping_array[i]; if (i == 0) { /* * Clear all bits corresponding to * TSNs smaller or equal to the * cumulative TSN. */ tsn_map &= (~0U << (1 - offset)); } selector = &sack_array[tsn_map]; if (mergeable && selector->right_edge) { /* * Backup, left and right edges were * ok to merge. */ num_nr_gap_blocks--; gap_descriptor--; } if (selector->num_entries == 0) mergeable = 0; else { for (j = 0; j < selector->num_entries; j++) { if (mergeable && selector->right_edge) { /* * do a merge by NOT * setting the left * side */ mergeable = 0; } else { /* * no merge, set the * left side */ mergeable = 0; gap_descriptor->start = htons((selector->gaps[j].start + offset)); } gap_descriptor->end = htons((selector->gaps[j].end + offset)); num_nr_gap_blocks++; gap_descriptor++; if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) { /* no more room */ limit_reached = 1; break; } } if (selector->left_edge) { mergeable = 1; } } if (limit_reached) { /* Reached the limit stop */ break; } offset += 8; } } } /* now we must add any dups we are going to report. */ if ((limit_reached == 0) && (asoc->numduptsns)) { dup = (uint32_t *) gap_descriptor; for (i = 0; i < asoc->numduptsns; i++) { *dup = htonl(asoc->dup_tsns[i]); dup++; num_dups++; if (((caddr_t)dup + sizeof(uint32_t)) > limit) { /* no more room */ break; } } asoc->numduptsns = 0; } /* * now that the chunk is prepared queue it to the control chunk * queue. */ if (type == SCTP_SELECTIVE_ACK) { a_chk->send_size = (uint16_t) (sizeof(struct sctp_sack_chunk) + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + num_dups * sizeof(int32_t)); SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; sack->sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); sack->sack.a_rwnd = htonl(asoc->my_rwnd); sack->sack.num_gap_ack_blks = htons(num_gap_blocks); sack->sack.num_dup_tsns = htons(num_dups); sack->ch.chunk_type = type; sack->ch.chunk_flags = flags; sack->ch.chunk_length = htons(a_chk->send_size); } else { a_chk->send_size = (uint16_t) (sizeof(struct sctp_nr_sack_chunk) + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + num_dups * sizeof(int32_t)); SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; nr_sack->nr_sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); nr_sack->nr_sack.a_rwnd = htonl(asoc->my_rwnd); nr_sack->nr_sack.num_gap_ack_blks = htons(num_gap_blocks); nr_sack->nr_sack.num_nr_gap_ack_blks = htons(num_nr_gap_blocks); nr_sack->nr_sack.num_dup_tsns = htons(num_dups); nr_sack->nr_sack.reserved = 0; nr_sack->ch.chunk_type = type; nr_sack->ch.chunk_flags = flags; nr_sack->ch.chunk_length = htons(a_chk->send_size); } TAILQ_INSERT_TAIL(&asoc->control_send_queue, a_chk, sctp_next); asoc->my_last_reported_rwnd = asoc->my_rwnd; asoc->ctrl_queue_cnt++; asoc->send_sack = 0; SCTP_STAT_INCR(sctps_sendsacks); return; } void sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { struct mbuf *m_abort, *m, *m_last; struct mbuf *m_out, *m_end = NULL; struct sctp_abort_chunk *abort; struct sctp_auth_chunk *auth = NULL; struct sctp_nets *net; uint32_t vtag; uint32_t auth_offset = 0; uint16_t cause_len, chunk_len, padding_len; SCTP_TCB_LOCK_ASSERT(stcb); /*- * Add an AUTH chunk, if chunk requires it and save the offset into * the chain for AUTH */ if (sctp_auth_is_required_chunk(SCTP_ABORT_ASSOCIATION, stcb->asoc.peer_auth_chunks)) { m_out = sctp_add_auth_chunk(NULL, &m_end, &auth, &auth_offset, stcb, SCTP_ABORT_ASSOCIATION); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else { m_out = NULL; } m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_abort == NULL) { if (m_out) { sctp_m_freem(m_out); } if (operr) { sctp_m_freem(operr); } return; } /* link in any error */ SCTP_BUF_NEXT(m_abort) = operr; cause_len = 0; m_last = NULL; for (m = operr; m; m = SCTP_BUF_NEXT(m)) { cause_len += (uint16_t) SCTP_BUF_LEN(m); if (SCTP_BUF_NEXT(m) == NULL) { m_last = m; } } SCTP_BUF_LEN(m_abort) = sizeof(struct sctp_abort_chunk); chunk_len = (uint16_t) sizeof(struct sctp_abort_chunk) + cause_len; padding_len = SCTP_SIZE32(chunk_len) - chunk_len; if (m_out == NULL) { /* NO Auth chunk prepended, so reserve space in front */ SCTP_BUF_RESV_UF(m_abort, SCTP_MIN_OVERHEAD); m_out = m_abort; } else { /* Put AUTH chunk at the front of the chain */ SCTP_BUF_NEXT(m_end) = m_abort; } if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } /* Fill in the ABORT chunk header. */ abort = mtod(m_abort, struct sctp_abort_chunk *); abort->ch.chunk_type = SCTP_ABORT_ASSOCIATION; if (stcb->asoc.peer_vtag == 0) { /* This happens iff the assoc is in COOKIE-WAIT state. */ vtag = stcb->asoc.my_vtag; abort->ch.chunk_flags = SCTP_HAD_NO_TCB; } else { vtag = stcb->asoc.peer_vtag; abort->ch.chunk_flags = 0; } abort->ch.chunk_length = htons(chunk_len); /* Add padding, if necessary. */ if (padding_len > 0) { if ((m_last == NULL) || (sctp_add_pad_tombuf(m_last, padding_len) == NULL)) { sctp_m_freem(m_out); return; } } (void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net, (struct sockaddr *)&net->ro._l_addr, m_out, auth_offset, auth, stcb->asoc.authinfo.active_keyid, 1, 0, 0, stcb->sctp_ep->sctp_lport, stcb->rport, htonl(vtag), stcb->asoc.primary_destination->port, NULL, 0, 0, so_locked); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } void sctp_send_shutdown_complete(struct sctp_tcb *stcb, struct sctp_nets *net, int reflect_vtag) { /* formulate and SEND a SHUTDOWN-COMPLETE */ struct mbuf *m_shutdown_comp; struct sctp_shutdown_complete_chunk *shutdown_complete; uint32_t vtag; uint8_t flags; m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown_comp == NULL) { /* no mbuf's */ return; } if (reflect_vtag) { flags = SCTP_HAD_NO_TCB; vtag = stcb->asoc.my_vtag; } else { flags = 0; vtag = stcb->asoc.peer_vtag; } shutdown_complete = mtod(m_shutdown_comp, struct sctp_shutdown_complete_chunk *); shutdown_complete->ch.chunk_type = SCTP_SHUTDOWN_COMPLETE; shutdown_complete->ch.chunk_flags = flags; shutdown_complete->ch.chunk_length = htons(sizeof(struct sctp_shutdown_complete_chunk)); SCTP_BUF_LEN(m_shutdown_comp) = sizeof(struct sctp_shutdown_complete_chunk); (void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net, (struct sockaddr *)&net->ro._l_addr, m_shutdown_comp, 0, NULL, 0, 1, 0, 0, stcb->sctp_ep->sctp_lport, stcb->rport, htonl(vtag), net->port, NULL, 0, 0, SCTP_SO_NOT_LOCKED); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); return; } static void sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, uint8_t type, struct mbuf *cause, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct mbuf *o_pak; struct mbuf *mout; struct sctphdr *shout; struct sctp_chunkhdr *ch; #if defined(INET) || defined(INET6) struct udphdr *udp; int ret; #endif int len, cause_len, padding_len; #ifdef INET struct sockaddr_in *src_sin, *dst_sin; struct ip *ip; #endif #ifdef INET6 struct sockaddr_in6 *src_sin6, *dst_sin6; struct ip6_hdr *ip6; #endif /* Compute the length of the cause and add final padding. */ cause_len = 0; if (cause != NULL) { struct mbuf *m_at, *m_last = NULL; for (m_at = cause; m_at; m_at = SCTP_BUF_NEXT(m_at)) { if (SCTP_BUF_NEXT(m_at) == NULL) m_last = m_at; cause_len += SCTP_BUF_LEN(m_at); } padding_len = cause_len % 4; if (padding_len != 0) { padding_len = 4 - padding_len; } if (padding_len != 0) { if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(cause); return; } } } else { padding_len = 0; } /* Get an mbuf for the header. */ len = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); switch (dst->sa_family) { #ifdef INET case AF_INET: len += sizeof(struct ip); break; #endif #ifdef INET6 case AF_INET6: len += sizeof(struct ip6_hdr); break; #endif default: break; } #if defined(INET) || defined(INET6) if (port) { len += sizeof(struct udphdr); } #endif mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_NOWAIT, 1, MT_DATA); if (mout == NULL) { if (cause) { sctp_m_freem(cause); } return; } SCTP_BUF_RESV_UF(mout, max_linkhdr); SCTP_BUF_LEN(mout) = len; SCTP_BUF_NEXT(mout) = cause; M_SETFIB(mout, fibnum); mout->m_pkthdr.flowid = mflowid; M_HASHTYPE_SET(mout, mflowtype); #ifdef INET ip = NULL; #endif #ifdef INET6 ip6 = NULL; #endif switch (dst->sa_family) { #ifdef INET case AF_INET: src_sin = (struct sockaddr_in *)src; dst_sin = (struct sockaddr_in *)dst; ip = mtod(mout, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = (sizeof(struct ip) >> 2); ip->ip_tos = 0; ip->ip_off = 0; ip_fillid(ip); ip->ip_ttl = MODULE_GLOBAL(ip_defttl); if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_SCTP; } ip->ip_src.s_addr = dst_sin->sin_addr.s_addr; ip->ip_dst.s_addr = src_sin->sin_addr.s_addr; ip->ip_sum = 0; len = sizeof(struct ip); shout = (struct sctphdr *)((caddr_t)ip + len); break; #endif #ifdef INET6 case AF_INET6: src_sin6 = (struct sockaddr_in6 *)src; dst_sin6 = (struct sockaddr_in6 *)dst; ip6 = mtod(mout, struct ip6_hdr *); ip6->ip6_flow = htonl(0x60000000); if (V_ip6_auto_flowlabel) { ip6->ip6_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); } ip6->ip6_hlim = MODULE_GLOBAL(ip6_defhlim); if (port) { ip6->ip6_nxt = IPPROTO_UDP; } else { ip6->ip6_nxt = IPPROTO_SCTP; } ip6->ip6_src = dst_sin6->sin6_addr; ip6->ip6_dst = src_sin6->sin6_addr; len = sizeof(struct ip6_hdr); shout = (struct sctphdr *)((caddr_t)ip6 + len); break; #endif default: len = 0; shout = mtod(mout, struct sctphdr *); break; } #if defined(INET) || defined(INET6) if (port) { if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) { sctp_m_freem(mout); return; } udp = (struct udphdr *)shout; udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; udp->uh_sum = 0; udp->uh_ulen = htons((uint16_t) (sizeof(struct udphdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + cause_len + padding_len)); len += sizeof(struct udphdr); shout = (struct sctphdr *)((caddr_t)shout + sizeof(struct udphdr)); } else { udp = NULL; } #endif shout->src_port = sh->dest_port; shout->dest_port = sh->src_port; shout->checksum = 0; if (vtag) { shout->v_tag = htonl(vtag); } else { shout->v_tag = sh->v_tag; } len += sizeof(struct sctphdr); ch = (struct sctp_chunkhdr *)((caddr_t)shout + sizeof(struct sctphdr)); ch->chunk_type = type; if (vtag) { ch->chunk_flags = 0; } else { ch->chunk_flags = SCTP_HAD_NO_TCB; } ch->chunk_length = htons((uint16_t) (sizeof(struct sctp_chunkhdr) + cause_len)); len += sizeof(struct sctp_chunkhdr); len += cause_len + padding_len; if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { sctp_m_freem(mout); return; } SCTP_ATTACH_CHAIN(o_pak, mout, len); switch (dst->sa_family) { #ifdef INET case AF_INET: if (port) { if (V_udp_cksum) { udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); } else { udp->uh_sum = 0; } } ip->ip_len = htons(len); if (port) { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); #endif if (V_udp_cksum) { SCTP_ENABLE_UDP_CSUM(o_pak); } } else { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else mout->m_pkthdr.csum_flags = CSUM_SCTP; mout->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); #endif } #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(o_pak); } #endif SCTP_IP_OUTPUT(ret, o_pak, NULL, NULL, vrf_id); break; #endif #ifdef INET6 case AF_INET6: ip6->ip6_plen = (uint16_t) (len - sizeof(struct ip6_hdr)); if (port) { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); #endif if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), len - sizeof(struct ip6_hdr))) == 0) { udp->uh_sum = 0xffff; } } else { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else mout->m_pkthdr.csum_flags = CSUM_SCTP_IPV6; mout->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); #endif } #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(o_pak); } #endif SCTP_IP6_OUTPUT(ret, o_pak, NULL, NULL, NULL, vrf_id); break; #endif default: SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n", dst->sa_family); sctp_m_freem(mout); SCTP_LTRACE_ERR_RET_PKT(mout, NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT); return; } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); return; } void sctp_send_shutdown_complete2(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { sctp_send_resp_msg(src, dst, sh, 0, SCTP_SHUTDOWN_COMPLETE, NULL, mflowtype, mflowid, fibnum, vrf_id, port); } void sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { struct sctp_tmit_chunk *chk; struct sctp_heartbeat_chunk *hb; struct timeval now; SCTP_TCB_LOCK_ASSERT(stcb); if (net == NULL) { return; } (void)SCTP_GETTIME_TIMEVAL(&now); switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak, can't get a chunk for hb\n"); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_heartbeat_chunk); chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, so_locked); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); /* Now we have a mbuf that we can fill in with the details */ hb = mtod(chk->data, struct sctp_heartbeat_chunk *); memset(hb, 0, sizeof(struct sctp_heartbeat_chunk)); /* fill out chunk header */ hb->ch.chunk_type = SCTP_HEARTBEAT_REQUEST; hb->ch.chunk_flags = 0; hb->ch.chunk_length = htons(chk->send_size); /* Fill out hb parameter */ hb->heartbeat.hb_info.ph.param_type = htons(SCTP_HEARTBEAT_INFO); hb->heartbeat.hb_info.ph.param_length = htons(sizeof(struct sctp_heartbeat_info_param)); hb->heartbeat.hb_info.time_value_1 = now.tv_sec; hb->heartbeat.hb_info.time_value_2 = now.tv_usec; /* Did our user request this one, put it in */ hb->heartbeat.hb_info.addr_family = (uint8_t) net->ro._l_addr.sa.sa_family; hb->heartbeat.hb_info.addr_len = net->ro._l_addr.sa.sa_len; if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* * we only take from the entropy pool if the address is not * confirmed. */ net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); } else { net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = 0; net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = 0; } switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: memcpy(hb->heartbeat.hb_info.address, &net->ro._l_addr.sin.sin_addr, sizeof(net->ro._l_addr.sin.sin_addr)); break; #endif #ifdef INET6 case AF_INET6: memcpy(hb->heartbeat.hb_info.address, &net->ro._l_addr.sin6.sin6_addr, sizeof(net->ro._l_addr.sin6.sin6_addr)); break; #endif default: if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, so_locked); return; break; } net->hb_responded = 0; TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); stcb->asoc.ctrl_queue_cnt++; SCTP_STAT_INCR(sctps_sendheartbeat); return; } void sctp_send_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn) { struct sctp_association *asoc; struct sctp_ecne_chunk *ecne; struct sctp_tmit_chunk *chk; if (net == NULL) { return; } asoc = &stcb->asoc; SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_ECN_ECHO) && (net == chk->whoTo)) { /* found a previous ECN_ECHO update it if needed */ uint32_t cnt, ctsn; ecne = mtod(chk->data, struct sctp_ecne_chunk *); ctsn = ntohl(ecne->tsn); if (SCTP_TSN_GT(high_tsn, ctsn)) { ecne->tsn = htonl(high_tsn); SCTP_STAT_INCR(sctps_queue_upd_ecne); } cnt = ntohl(ecne->num_pkts_since_cwr); cnt++; ecne->num_pkts_since_cwr = htonl(cnt); return; } } /* nope could not find one to update so we must build one */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } SCTP_STAT_INCR(sctps_queue_upd_ecne); chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ECN_ECHO; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_ecne_chunk); chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); stcb->asoc.ecn_echo_cnt_onq++; ecne = mtod(chk->data, struct sctp_ecne_chunk *); ecne->ch.chunk_type = SCTP_ECN_ECHO; ecne->ch.chunk_flags = 0; ecne->ch.chunk_length = htons(sizeof(struct sctp_ecne_chunk)); ecne->tsn = htonl(high_tsn); ecne->num_pkts_since_cwr = htonl(1); TAILQ_INSERT_HEAD(&stcb->asoc.control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } void sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net, struct mbuf *m, int len, int iphlen, int bad_crc) { struct sctp_association *asoc; struct sctp_pktdrop_chunk *drp; struct sctp_tmit_chunk *chk; uint8_t *datap; int was_trunc = 0; int fullsz = 0; long spc; int offset; struct sctp_chunkhdr *ch, chunk_buf; unsigned int chk_length; if (!stcb) { return; } asoc = &stcb->asoc; SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->pktdrop_supported == 0) { /*- * peer must declare support before I send one. */ return; } if (stcb->sctp_socket == NULL) { return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_PACKET_DROPPED; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; len -= iphlen; chk->send_size = len; /* Validate that we do not have an ABORT in here. */ offset = iphlen + sizeof(struct sctphdr); ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *) & chunk_buf); while (ch != NULL) { chk_length = ntohs(ch->chunk_length); if (chk_length < sizeof(*ch)) { /* break to abort land */ break; } switch (ch->chunk_type) { case SCTP_PACKET_DROPPED: case SCTP_ABORT_ASSOCIATION: case SCTP_INITIATION_ACK: /** * We don't respond with an PKT-DROP to an ABORT * or PKT-DROP. We also do not respond to an * INIT-ACK, because we can't know if the initiation * tag is correct or not. */ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; default: break; } offset += SCTP_SIZE32(chk_length); ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *) & chunk_buf); } if ((len + SCTP_MAX_OVERHEAD + sizeof(struct sctp_pktdrop_chunk)) > min(stcb->asoc.smallest_mtu, MCLBYTES)) { /* * only send 1 mtu worth, trim off the excess on the end. */ fullsz = len; len = min(stcb->asoc.smallest_mtu, MCLBYTES) - SCTP_MAX_OVERHEAD; was_trunc = 1; } chk->asoc = &stcb->asoc; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { jump_out: sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); drp = mtod(chk->data, struct sctp_pktdrop_chunk *); if (drp == NULL) { sctp_m_freem(chk->data); chk->data = NULL; goto jump_out; } chk->book_size = SCTP_SIZE32((chk->send_size + sizeof(struct sctp_pktdrop_chunk) + sizeof(struct sctphdr) + SCTP_MED_OVERHEAD)); chk->book_size_scale = 0; if (was_trunc) { drp->ch.chunk_flags = SCTP_PACKET_TRUNCATED; drp->trunc_len = htons(fullsz); /* * Len is already adjusted to size minus overhead above take * out the pkt_drop chunk itself from it. */ chk->send_size = (uint16_t) (len - sizeof(struct sctp_pktdrop_chunk)); len = chk->send_size; } else { /* no truncation needed */ drp->ch.chunk_flags = 0; drp->trunc_len = htons(0); } if (bad_crc) { drp->ch.chunk_flags |= SCTP_BADCRC; } chk->send_size += sizeof(struct sctp_pktdrop_chunk); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (net) { /* we should hit here */ chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); } else { chk->whoTo = NULL; } drp->ch.chunk_type = SCTP_PACKET_DROPPED; drp->ch.chunk_length = htons(chk->send_size); spc = SCTP_SB_LIMIT_RCV(stcb->sctp_socket); if (spc < 0) { spc = 0; } drp->bottle_bw = htonl(spc); if (asoc->my_rwnd) { drp->current_onq = htonl(asoc->size_on_reasm_queue + asoc->size_on_all_streams + asoc->my_rwnd_control_len + stcb->sctp_socket->so_rcv.sb_cc); } else { /*- * If my rwnd is 0, possibly from mbuf depletion as well as * space used, tell the peer there is NO space aka onq == bw */ drp->current_onq = htonl(spc); } drp->reserved = 0; datap = drp->data; m_copydata(m, iphlen, len, (caddr_t)datap); TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } void sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn, uint8_t override) { struct sctp_association *asoc; struct sctp_cwr_chunk *cwr; struct sctp_tmit_chunk *chk; SCTP_TCB_LOCK_ASSERT(stcb); if (net == NULL) { return; } asoc = &stcb->asoc; TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_ECN_CWR) && (net == chk->whoTo)) { /* * found a previous CWR queued to same destination * update it if needed */ uint32_t ctsn; cwr = mtod(chk->data, struct sctp_cwr_chunk *); ctsn = ntohl(cwr->tsn); if (SCTP_TSN_GT(high_tsn, ctsn)) { cwr->tsn = htonl(high_tsn); } if (override & SCTP_CWR_REDUCE_OVERRIDE) { /* Make sure override is carried */ cwr->ch.chunk_flags |= SCTP_CWR_REDUCE_OVERRIDE; } return; } } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ECN_CWR; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_cwr_chunk); chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); cwr = mtod(chk->data, struct sctp_cwr_chunk *); cwr->ch.chunk_type = SCTP_ECN_CWR; cwr->ch.chunk_flags = override; cwr->ch.chunk_length = htons(sizeof(struct sctp_cwr_chunk)); cwr->tsn = htonl(high_tsn); TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } static int sctp_add_stream_reset_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, uint32_t seq, uint32_t resp_seq, uint32_t last_sent) { uint16_t len, old_len, i; struct sctp_stream_reset_out_request *req_out; struct sctp_chunkhdr *ch; int at; int number_entries = 0; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ req_out = (struct sctp_stream_reset_out_request *)((caddr_t)ch + len); /* now how long will this param be? */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) && (stcb->asoc.strmout[i].chunks_on_queues == 0) && TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { number_entries++; } } if (number_entries == 0) { return (0); } if (number_entries == stcb->asoc.streamoutcnt) { number_entries = 0; } if (number_entries > SCTP_MAX_STREAMS_AT_ONCE_RESET) { number_entries = SCTP_MAX_STREAMS_AT_ONCE_RESET; } len = (uint16_t) (sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries)); req_out->ph.param_type = htons(SCTP_STR_RESET_OUT_REQUEST); req_out->ph.param_length = htons(len); req_out->request_seq = htonl(seq); req_out->response_seq = htonl(resp_seq); req_out->send_reset_at_tsn = htonl(last_sent); at = 0; if (number_entries) { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) && (stcb->asoc.strmout[i].chunks_on_queues == 0) && TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { req_out->list_of_streams[at] = htons(i); at++; stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT; if (at >= number_entries) { break; } } } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT; } } if (SCTP_SIZE32(len) > len) { /*- * Need to worry about the pad we may end up adding to the * end. This is easy since the struct is either aligned to 4 * bytes or 2 bytes off. */ req_out->list_of_streams[number_entries] = 0; } /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->book_size_scale = 0; chk->send_size = SCTP_SIZE32(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; return (1); } static void sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk, int number_entries, uint16_t * list, uint32_t seq) { uint16_t len, old_len, i; struct sctp_stream_reset_in_request *req_in; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ req_in = (struct sctp_stream_reset_in_request *)((caddr_t)ch + len); /* now how long will this param be? */ len = (uint16_t) (sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries)); req_in->ph.param_type = htons(SCTP_STR_RESET_IN_REQUEST); req_in->ph.param_length = htons(len); req_in->request_seq = htonl(seq); if (number_entries) { for (i = 0; i < number_entries; i++) { req_in->list_of_streams[i] = htons(list[i]); } } if (SCTP_SIZE32(len) > len) { /*- * Need to worry about the pad we may end up adding to the * end. This is easy since the struct is either aligned to 4 * bytes or 2 bytes off. */ req_in->list_of_streams[number_entries] = 0; } /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->book_size_scale = 0; chk->send_size = SCTP_SIZE32(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; return; } static void sctp_add_stream_reset_tsn(struct sctp_tmit_chunk *chk, uint32_t seq) { uint16_t len, old_len; struct sctp_stream_reset_tsn_request *req_tsn; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ req_tsn = (struct sctp_stream_reset_tsn_request *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_tsn_request); req_tsn->ph.param_type = htons(SCTP_STR_RESET_TSN_REQUEST); req_tsn->ph.param_length = htons(len); req_tsn->request_seq = htonl(seq); /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->send_size = len + old_len; chk->book_size = SCTP_SIZE32(chk->send_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); return; } void sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk, uint32_t resp_seq, uint32_t result) { uint16_t len, old_len; struct sctp_stream_reset_response *resp; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ resp = (struct sctp_stream_reset_response *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_response); resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE); resp->ph.param_length = htons(len); resp->response_seq = htonl(resp_seq); resp->result = ntohl(result); /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->book_size_scale = 0; chk->send_size = SCTP_SIZE32(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; return; } void sctp_send_deferred_reset_response(struct sctp_tcb *stcb, struct sctp_stream_reset_list *ent, int response) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; asoc = &stcb->asoc; /* * Reset our last reset action to the new one IP -> response * (PERFORMED probably). This assures that if we fail to send, a * retran from the peer will get the new response. */ asoc->last_reset_action[0] = response; if (asoc->stream_reset_outstanding) { return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (stcb->asoc.alternate) { chk->whoTo = stcb->asoc.alternate; } else { chk->whoTo = stcb->asoc.primary_destination; } ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->book_size); atomic_add_int(&chk->whoTo->ref_count, 1); SCTP_BUF_LEN(chk->data) = chk->send_size; sctp_add_stream_reset_result(chk, ent->seq, response); /* insert the chunk for sending */ TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } void sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk, uint32_t resp_seq, uint32_t result, uint32_t send_una, uint32_t recv_next) { uint16_t len, old_len; struct sctp_stream_reset_response_tsn *resp; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ resp = (struct sctp_stream_reset_response_tsn *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_response_tsn); resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE); resp->ph.param_length = htons(len); resp->response_seq = htonl(resp_seq); resp->result = htonl(result); resp->senders_next_tsn = htonl(send_una); resp->receivers_next_tsn = htonl(recv_next); /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = chk->send_size; return; } static void sctp_add_an_out_stream(struct sctp_tmit_chunk *chk, uint32_t seq, uint16_t adding) { uint16_t len, old_len; struct sctp_chunkhdr *ch; struct sctp_stream_reset_add_strm *addstr; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_add_strm); /* Fill it out. */ addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_OUT_STREAMS); addstr->ph.param_length = htons(len); addstr->request_seq = htonl(seq); addstr->number_of_streams = htons(adding); addstr->reserved = 0; /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->send_size = len + old_len; chk->book_size = SCTP_SIZE32(chk->send_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); return; } static void sctp_add_an_in_stream(struct sctp_tmit_chunk *chk, uint32_t seq, uint16_t adding) { uint16_t len, old_len; struct sctp_chunkhdr *ch; struct sctp_stream_reset_add_strm *addstr; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_add_strm); /* Fill it out. */ addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_IN_STREAMS); addstr->ph.param_length = htons(len); addstr->request_seq = htonl(seq); addstr->number_of_streams = htons(adding); addstr->reserved = 0; /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->send_size = len + old_len; chk->book_size = SCTP_SIZE32(chk->send_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); return; } int sctp_send_stream_reset_out_if_possible(struct sctp_tcb *stcb, int so_locked) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; uint32_t seq; asoc = &stcb->asoc; asoc->trigger_reset = 0; if (asoc->stream_reset_outstanding) { return (EALREADY); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, so_locked); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (stcb->asoc.alternate) { chk->whoTo = stcb->asoc.alternate; } else { chk->whoTo = stcb->asoc.primary_destination; } ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->book_size); atomic_add_int(&chk->whoTo->ref_count, 1); SCTP_BUF_LEN(chk->data) = chk->send_size; seq = stcb->asoc.str_reset_seq_out; if (sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1))) { seq++; asoc->stream_reset_outstanding++; } else { m_freem(chk->data); chk->data = NULL; sctp_free_a_chunk(stcb, chk, so_locked); return (ENOENT); } asoc->str_reset = chk; /* insert the chunk for sending */ TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; if (stcb->asoc.send_sack) { sctp_send_sack(stcb, so_locked); } sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); return (0); } int sctp_send_str_reset_req(struct sctp_tcb *stcb, uint16_t number_entries, uint16_t * list, uint8_t send_in_req, uint8_t send_tsn_req, uint8_t add_stream, uint16_t adding_o, uint16_t adding_i, uint8_t peer_asked) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; int can_send_out_req = 0; uint32_t seq; asoc = &stcb->asoc; if (asoc->stream_reset_outstanding) { /*- * Already one pending, must get ACK back to clear the flag. */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EBUSY); return (EBUSY); } if ((send_in_req == 0) && (send_tsn_req == 0) && (add_stream == 0)) { /* nothing to do */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } if (send_tsn_req && send_in_req) { /* error, can't do that */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } else if (send_in_req) { can_send_out_req = 1; } if (number_entries > (MCLBYTES - SCTP_MIN_OVERHEAD - sizeof(struct sctp_chunkhdr) - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t)) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (stcb->asoc.alternate) { chk->whoTo = stcb->asoc.alternate; } else { chk->whoTo = stcb->asoc.primary_destination; } atomic_add_int(&chk->whoTo->ref_count, 1); ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; seq = stcb->asoc.str_reset_seq_out; if (can_send_out_req) { int ret; ret = sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1)); if (ret) { seq++; asoc->stream_reset_outstanding++; } } if ((add_stream & 1) && ((stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt) < adding_o)) { /* Need to allocate more */ struct sctp_stream_out *oldstream; struct sctp_stream_queue_pending *sp, *nsp; int i; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif oldstream = stcb->asoc.strmout; /* get some more */ SCTP_MALLOC(stcb->asoc.strmout, struct sctp_stream_out *, (stcb->asoc.streamoutcnt + adding_o) * sizeof(struct sctp_stream_out), SCTP_M_STRMO); if (stcb->asoc.strmout == NULL) { uint8_t x; stcb->asoc.strmout = oldstream; /* Turn off the bit */ x = add_stream & 0xfe; add_stream = x; goto skip_stuff; } /* * Ok now we proceed with copying the old out stuff and * initializing the new stuff. */ SCTP_TCB_SEND_LOCK(stcb); stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 0, 1); for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = oldstream[i].chunks_on_queues; stcb->asoc.strmout[i].next_mid_ordered = oldstream[i].next_mid_ordered; stcb->asoc.strmout[i].next_mid_unordered = oldstream[i].next_mid_unordered; stcb->asoc.strmout[i].last_msg_incomplete = oldstream[i].last_msg_incomplete; stcb->asoc.strmout[i].stream_no = i; stcb->asoc.strmout[i].state = oldstream[i].state; /* FIX ME FIX ME */ /* * This should be a SS_COPY operation FIX ME STREAM * SCHEDULER EXPERT */ stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], &oldstream[i]); /* now anything on those queues? */ TAILQ_FOREACH_SAFE(sp, &oldstream[i].outqueue, next, nsp) { TAILQ_REMOVE(&oldstream[i].outqueue, sp, next); TAILQ_INSERT_TAIL(&stcb->asoc.strmout[i].outqueue, sp, next); } } /* now the new streams */ stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); for (i = stcb->asoc.streamoutcnt; i < (stcb->asoc.streamoutcnt + adding_o); i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { stcb->asoc.strmout[i].abandoned_sent[j] = 0; stcb->asoc.strmout[i].abandoned_unsent[j] = 0; } #else stcb->asoc.strmout[i].abandoned_sent[0] = 0; stcb->asoc.strmout[i].abandoned_unsent[0] = 0; #endif stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; stcb->asoc.strmout[i].stream_no = i; stcb->asoc.strmout[i].last_msg_incomplete = 0; stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL); stcb->asoc.strmout[i].state = SCTP_STREAM_CLOSED; } stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt + adding_o; SCTP_FREE(oldstream, SCTP_M_STRMO); SCTP_TCB_SEND_UNLOCK(stcb); } skip_stuff: if ((add_stream & 1) && (adding_o > 0)) { asoc->strm_pending_add_size = adding_o; asoc->peer_req_out = peer_asked; sctp_add_an_out_stream(chk, seq, adding_o); seq++; asoc->stream_reset_outstanding++; } if ((add_stream & 2) && (adding_i > 0)) { sctp_add_an_in_stream(chk, seq, adding_i); seq++; asoc->stream_reset_outstanding++; } if (send_in_req) { sctp_add_stream_reset_in(chk, number_entries, list, seq); seq++; asoc->stream_reset_outstanding++; } if (send_tsn_req) { sctp_add_stream_reset_tsn(chk, seq); asoc->stream_reset_outstanding++; } asoc->str_reset = chk; /* insert the chunk for sending */ TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; if (stcb->asoc.send_sack) { sctp_send_sack(stcb, SCTP_SO_LOCKED); } sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); return (0); } void sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, struct mbuf *cause, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { /* Don't respond to an ABORT with an ABORT. */ if (sctp_is_there_an_abort_here(m, iphlen, &vtag)) { if (cause) sctp_m_freem(cause); return; } sctp_send_resp_msg(src, dst, sh, vtag, SCTP_ABORT_ASSOCIATION, cause, mflowtype, mflowid, fibnum, vrf_id, port); return; } void sctp_send_operr_to(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, struct mbuf *cause, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { sctp_send_resp_msg(src, dst, sh, vtag, SCTP_OPERATION_ERROR, cause, mflowtype, mflowid, fibnum, vrf_id, port); return; } static struct mbuf * sctp_copy_resume(struct uio *uio, int max_send_len, int user_marks_eor, int *error, uint32_t * sndout, struct mbuf **new_tail) { struct mbuf *m; m = m_uiotombuf(uio, M_WAITOK, max_send_len, 0, (M_PKTHDR | (user_marks_eor ? M_EOR : 0))); if (m == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOBUFS); *error = ENOBUFS; } else { *sndout = m_length(m, NULL); *new_tail = m_last(m); } return (m); } static int sctp_copy_one(struct sctp_stream_queue_pending *sp, struct uio *uio, int resv_upfront) { sp->data = m_uiotombuf(uio, M_WAITOK, sp->length, resv_upfront, 0); if (sp->data == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOBUFS); return (ENOBUFS); } sp->tail_mbuf = m_last(sp->data); return (0); } static struct sctp_stream_queue_pending * sctp_copy_it_in(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_sndrcvinfo *srcv, struct uio *uio, struct sctp_nets *net, int max_send_len, int user_marks_eor, int *error) { /*- * This routine must be very careful in its work. Protocol * processing is up and running so care must be taken to spl...() * when you need to do something that may effect the stcb/asoc. The * sb is locked however. When data is copied the protocol processing * should be enabled since this is a slower operation... */ struct sctp_stream_queue_pending *sp = NULL; int resv_in_first; *error = 0; /* Now can we send this? */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { /* got data while shutting down */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); *error = ECONNRESET; goto out_now; } sctp_alloc_a_strmoq(stcb, sp); if (sp == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); *error = ENOMEM; goto out_now; } sp->act_flags = 0; sp->sender_all_done = 0; sp->sinfo_flags = srcv->sinfo_flags; sp->timetolive = srcv->sinfo_timetolive; sp->ppid = srcv->sinfo_ppid; sp->context = srcv->sinfo_context; sp->fsn = 0; (void)SCTP_GETTIME_TIMEVAL(&sp->ts); sp->stream = srcv->sinfo_stream; sp->length = (uint32_t) min(uio->uio_resid, max_send_len); if ((sp->length == (uint32_t) uio->uio_resid) && ((user_marks_eor == 0) || (srcv->sinfo_flags & SCTP_EOF) || (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) { sp->msg_is_complete = 1; } else { sp->msg_is_complete = 0; } sp->sender_all_done = 0; sp->some_taken = 0; sp->put_last_out = 0; resv_in_first = sizeof(struct sctp_data_chunk); sp->data = sp->tail_mbuf = NULL; if (sp->length == 0) { *error = 0; goto skip_copy; } if (srcv->sinfo_keynumber_valid) { sp->auth_keyid = srcv->sinfo_keynumber; } else { sp->auth_keyid = stcb->asoc.authinfo.active_keyid; } if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { sctp_auth_key_acquire(stcb, sp->auth_keyid); sp->holds_key_ref = 1; } *error = sctp_copy_one(sp, uio, resv_in_first); skip_copy: if (*error) { sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED); sp = NULL; } else { if (sp->sinfo_flags & SCTP_ADDR_OVER) { sp->net = net; atomic_add_int(&sp->net->ref_count, 1); } else { sp->net = NULL; } sctp_set_prsctp_policy(sp); } out_now: return (sp); } int sctp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *p ) { int error, use_sndinfo = 0; struct sctp_sndrcvinfo sndrcvninfo; struct sockaddr *addr_to_use; #if defined(INET) && defined(INET6) struct sockaddr_in sin; #endif if (control) { /* process cmsg snd/rcv info (maybe a assoc-id) */ if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&sndrcvninfo, control, sizeof(sndrcvninfo))) { /* got one */ use_sndinfo = 1; } } addr_to_use = addr; #if defined(INET) && defined(INET6) if ((addr) && (addr->sa_family == AF_INET6)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin, sin6); addr_to_use = (struct sockaddr *)&sin; } } #endif error = sctp_lower_sosend(so, addr_to_use, uio, top, control, flags, use_sndinfo ? &sndrcvninfo : NULL ,p ); return (error); } int sctp_lower_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *i_pak, struct mbuf *control, int flags, struct sctp_sndrcvinfo *srcv , struct thread *p ) { unsigned int sndlen = 0, max_len; int error, len; struct mbuf *top = NULL; int queue_only = 0, queue_only_for_init = 0; int free_cnt_applied = 0; int un_sent; int now_filled = 0; unsigned int inqueue_bytes = 0; struct sctp_block_entry be; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; struct timeval now; struct sctp_nets *net; struct sctp_association *asoc; struct sctp_inpcb *t_inp; int user_marks_eor; int create_lock_applied = 0; int nagle_applies = 0; int some_on_control = 0; int got_all_of_the_send = 0; int hold_tcblock = 0; int non_blocking = 0; uint32_t local_add_more, local_soresv = 0; uint16_t port; uint16_t sinfo_flags; sctp_assoc_t sinfo_assoc_id; error = 0; net = NULL; stcb = NULL; asoc = NULL; t_inp = inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; if (i_pak) { SCTP_RELEASE_PKT(i_pak); } return (error); } if ((uio == NULL) && (i_pak == NULL)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } user_marks_eor = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); atomic_add_int(&inp->total_sends, 1); if (uio) { if (uio->uio_resid < 0) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } sndlen = (unsigned int)uio->uio_resid; } else { top = SCTP_HEADER_TO_CHAIN(i_pak); sndlen = SCTP_HEADER_LEN(i_pak); } SCTPDBG(SCTP_DEBUG_OUTPUT1, "Send called addr:%p send length %d\n", (void *)addr, sndlen); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_socket->so_qlimit)) { /* The listener can NOT send */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); error = ENOTCONN; goto out_unlocked; } /** * Pre-screen address, if one is given the sin-len * must be set correctly! */ if (addr) { union sctp_sockstore *raddr = (union sctp_sockstore *)addr; switch (raddr->sa.sa_family) { #ifdef INET case AF_INET: if (raddr->sin.sin_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } port = raddr->sin.sin_port; break; #endif #ifdef INET6 case AF_INET6: if (raddr->sin6.sin6_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } port = raddr->sin6.sin6_port; break; #endif default: SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAFNOSUPPORT); error = EAFNOSUPPORT; goto out_unlocked; } } else port = 0; if (srcv) { sinfo_flags = srcv->sinfo_flags; sinfo_assoc_id = srcv->sinfo_assoc_id; if (INVALID_SINFO_FLAG(sinfo_flags) || PR_SCTP_INVALID_POLICY(sinfo_flags)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if (srcv->sinfo_flags) SCTP_STAT_INCR(sctps_sends_with_flags); } else { sinfo_flags = inp->def_send.sinfo_flags; sinfo_assoc_id = inp->def_send.sinfo_assoc_id; } if (sinfo_flags & SCTP_SENDALL) { /* its a sendall */ error = sctp_sendall(inp, uio, top, srcv); top = NULL; goto out_unlocked; } if ((sinfo_flags & SCTP_ADDR_OVER) && (addr == NULL)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } /* now we must find the assoc */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } SCTP_INP_RUNLOCK(inp); } else if (sinfo_assoc_id) { stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 1); if (stcb != NULL) { hold_tcblock = 1; } } else if (addr) { /*- * Since we did not use findep we must * increment it, and if we don't find a tcb * decrement it. */ SCTP_INP_WLOCK(inp); SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } else { hold_tcblock = 1; } } if ((stcb == NULL) && (addr)) { /* Possible implicit send? */ SCTP_ASOC_CREATE_LOCK(inp); create_lock_applied = 1; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* Should I really unlock ? */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && (addr->sa_family == AF_INET6)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } SCTP_INP_WLOCK(inp); SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); /* With the lock applied look again */ stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); if ((stcb == NULL) && (control != NULL) && (port > 0)) { stcb = sctp_findassociation_cmsgs(&t_inp, port, control, &net, &error); } if (stcb == NULL) { SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } else { hold_tcblock = 1; } if (error) { goto out_unlocked; } if (t_inp != inp) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); error = ENOTCONN; goto out_unlocked; } } if (stcb == NULL) { if (addr == NULL) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT); error = ENOENT; goto out_unlocked; } else { /* We must go ahead and start the INIT process */ uint32_t vrf_id; if ((sinfo_flags & SCTP_ABORT) || ((sinfo_flags & SCTP_EOF) && (sndlen == 0))) { /*- * User asks to abort a non-existant assoc, * or EOF a non-existant assoc with no data */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT); error = ENOENT; goto out_unlocked; } /* get an asoc/stcb struct */ vrf_id = inp->def_vrf_id; #ifdef INVARIANTS if (create_lock_applied == 0) { panic("Error, should hold create lock and I don't?"); } #endif stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, inp->sctp_ep.port, p); if (stcb == NULL) { /* Error is setup for us in the call */ goto out_unlocked; } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; /* * Set the connected flag so we can queue * data */ soisconnecting(so); } hold_tcblock = 1; if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); create_lock_applied = 0; } else { SCTP_PRINTF("Huh-3? create lock should have been on??\n"); } /* * Turn on queue only flag to prevent data from * being sent */ queue_only = 1; asoc = &stcb->asoc; SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); /* initialize authentication params for the assoc */ sctp_initialize_auth_params(inp, stcb); if (control) { if (sctp_process_cmsgs_for_init(stcb, control, &error)) { sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5); hold_tcblock = 0; stcb = NULL; goto out_unlocked; } } /* out with the INIT */ queue_only_for_init = 1; /*- * we may want to dig in after this call and adjust the MTU * value. It defaulted to 1500 (constant) but the ro * structure may now have an update and thus we may need to * change it BEFORE we append the message. */ } } else asoc = &stcb->asoc; if (srcv == NULL) srcv = (struct sctp_sndrcvinfo *)&asoc->def_send; if (srcv->sinfo_flags & SCTP_ADDR_OVER) { if (addr) net = sctp_findnet(stcb, addr); else net = NULL; if ((net == NULL) || ((port != 0) && (port != stcb->rport))) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } } else { if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } } atomic_add_int(&stcb->total_sends, 1); /* Keep the stcb from being freed under our feet */ atomic_add_int(&asoc->refcnt, 1); free_cnt_applied = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT)) { if (sndlen > asoc->smallest_mtu) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; goto out_unlocked; } } if (SCTP_SO_IS_NBIO(so) || (flags & MSG_NBIO) ) { non_blocking = 1; } /* would we block? */ if (non_blocking) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); if ((SCTP_SB_LIMIT_SND(so) < (sndlen + inqueue_bytes + stcb->asoc.sb_send_resv)) || (stcb->asoc.chunks_on_out_queue >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EWOULDBLOCK); if (sndlen > SCTP_SB_LIMIT_SND(so)) error = EMSGSIZE; else error = EWOULDBLOCK; goto out_unlocked; } stcb->asoc.sb_send_resv += sndlen; SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } else { atomic_add_int(&stcb->asoc.sb_send_resv, sndlen); } local_soresv = sndlen; if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; goto out_unlocked; } if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); create_lock_applied = 0; } /* Is the stream no. valid? */ if (srcv->sinfo_stream >= asoc->streamoutcnt) { /* Invalid stream number */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if ((asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPEN) && (asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPENING)) { /* * Can't queue any data while stream reset is underway. */ if (asoc->strmout[srcv->sinfo_stream].state > SCTP_STREAM_OPEN) { error = EAGAIN; } else { error = EINVAL; } SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, error); goto out_unlocked; } if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { queue_only = 1; } /* we are now done with all control */ if (control) { sctp_m_freem(control); control = NULL; } if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { if (srcv->sinfo_flags & SCTP_ABORT) { ; } else { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; goto out_unlocked; } } /* Ok, we will attempt a msgsnd :> */ if (p) { p->td_ru.ru_msgsnd++; } /* Are we aborting? */ if (srcv->sinfo_flags & SCTP_ABORT) { struct mbuf *mm; int tot_demand, tot_out = 0, max_out; SCTP_STAT_INCR(sctps_sends_with_abort); if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { /* It has to be up before we abort */ /* how big is the user initiated abort? */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out; } if (hold_tcblock) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } if (top) { struct mbuf *cntm = NULL; mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_WAITOK, 1, MT_DATA); if (sndlen != 0) { for (cntm = top; cntm; cntm = SCTP_BUF_NEXT(cntm)) { tot_out += SCTP_BUF_LEN(cntm); } } } else { /* Must fit in a MTU */ tot_out = sndlen; tot_demand = (tot_out + sizeof(struct sctp_paramhdr)); if (tot_demand > SCTP_DEFAULT_ADD_MORE) { /* To big */ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; goto out; } mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAITOK, 1, MT_DATA); } if (mm == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); error = ENOMEM; goto out; } max_out = asoc->smallest_mtu - sizeof(struct sctp_paramhdr); max_out -= sizeof(struct sctp_abort_msg); if (tot_out > max_out) { tot_out = max_out; } if (mm) { struct sctp_paramhdr *ph; /* now move forward the data pointer */ ph = mtod(mm, struct sctp_paramhdr *); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); ph->param_length = htons((uint16_t) (sizeof(struct sctp_paramhdr) + tot_out)); ph++; SCTP_BUF_LEN(mm) = tot_out + sizeof(struct sctp_paramhdr); if (top == NULL) { error = uiomove((caddr_t)ph, (int)tot_out, uio); if (error) { /*- * Here if we can't get his data we * still abort we just don't get to * send the users note :-0 */ sctp_m_freem(mm); mm = NULL; } } else { if (sndlen != 0) { SCTP_BUF_NEXT(mm) = top; } } } if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); } atomic_add_int(&stcb->asoc.refcnt, -1); free_cnt_applied = 0; /* release this lock, otherwise we hang on ourselves */ sctp_abort_an_association(stcb->sctp_ep, stcb, mm, SCTP_SO_LOCKED); /* now relock the stcb so everything is sane */ hold_tcblock = 0; stcb = NULL; /* * In this case top is already chained to mm avoid double * free, since we free it below if top != NULL and driver * would free it after sending the packet out */ if (sndlen != 0) { top = NULL; } goto out_unlocked; } /* Calculate the maximum we can send */ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) { if (non_blocking) { /* we already checked for non-blocking above. */ max_len = sndlen; } else { max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; } } else { max_len = 0; } if (hold_tcblock) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } if (asoc->strmout == NULL) { /* huh? software error */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); error = EFAULT; goto out_unlocked; } /* Unless E_EOR mode is on, we must make a send FIT in one call. */ if ((user_marks_eor == 0) && (sndlen > SCTP_SB_LIMIT_SND(stcb->sctp_socket))) { /* It will NEVER fit */ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; goto out_unlocked; } if ((uio == NULL) && user_marks_eor) { /*- * We do not support eeor mode for * sending with mbuf chains (like sendfile). */ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if (user_marks_eor) { local_add_more = min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold)); } else { /*- * For non-eeor the whole message must fit in * the socket send buffer. */ local_add_more = sndlen; } len = 0; if (non_blocking) { goto skip_preblock; } if (((max_len <= local_add_more) && (SCTP_SB_LIMIT_SND(so) >= local_add_more)) || (max_len == 0) || ((stcb->asoc.chunks_on_out_queue + stcb->asoc.stream_queue_cnt) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { /* No room right now ! */ SOCKBUF_LOCK(&so->so_snd); inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); while ((SCTP_SB_LIMIT_SND(so) < (inqueue_bytes + local_add_more)) || ((stcb->asoc.stream_queue_cnt + stcb->asoc.chunks_on_out_queue) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "pre_block limit:%u <(inq:%d + %d) || (%d+%d > %d)\n", (unsigned int)SCTP_SB_LIMIT_SND(so), inqueue_bytes, local_add_more, stcb->asoc.stream_queue_cnt, stcb->asoc.chunks_on_out_queue, SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_INTO_BLKA, asoc, sndlen); } be.error = 0; stcb->block_entry = &be; error = sbwait(&so->so_snd); stcb->block_entry = NULL; if (error || so->so_error || be.error) { if (error == 0) { if (so->so_error) error = so->so_error; if (be.error) { error = be.error; } } SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK, asoc, stcb->asoc.total_output_queue_size); } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); } if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) { max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; } else { max_len = 0; } SOCKBUF_UNLOCK(&so->so_snd); } skip_preblock: if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { goto out_unlocked; } /* * sndlen covers for mbuf case uio_resid covers for the non-mbuf * case NOTE: uio will be null when top/mbuf is passed */ if (sndlen == 0) { if (srcv->sinfo_flags & SCTP_EOF) { got_all_of_the_send = 1; goto dataless_eof; } else { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out; } } if (top == NULL) { struct sctp_stream_queue_pending *sp; struct sctp_stream_out *strm; uint32_t sndout; SCTP_TCB_SEND_LOCK(stcb); if ((asoc->stream_locked) && (asoc->stream_locked_on != srcv->sinfo_stream)) { SCTP_TCB_SEND_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out; } SCTP_TCB_SEND_UNLOCK(stcb); strm = &stcb->asoc.strmout[srcv->sinfo_stream]; if (strm->last_msg_incomplete == 0) { do_a_copy_in: sp = sctp_copy_it_in(stcb, asoc, srcv, uio, net, max_len, user_marks_eor, &error); if ((sp == NULL) || (error)) { goto out; } SCTP_TCB_SEND_LOCK(stcb); if (sp->msg_is_complete) { strm->last_msg_incomplete = 0; asoc->stream_locked = 0; } else { /* * Just got locked to this guy in case of an * interrupt. */ strm->last_msg_incomplete = 1; if (stcb->asoc.idata_supported == 0) { asoc->stream_locked = 1; asoc->stream_locked_on = srcv->sinfo_stream; } sp->sender_all_done = 0; } sctp_snd_sb_alloc(stcb, sp->length); atomic_add_int(&asoc->stream_queue_cnt, 1); if (srcv->sinfo_flags & SCTP_UNORDERED) { SCTP_STAT_INCR(sctps_sends_with_unord); } TAILQ_INSERT_TAIL(&strm->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb, asoc, strm, sp, 1); SCTP_TCB_SEND_UNLOCK(stcb); } else { SCTP_TCB_SEND_LOCK(stcb); sp = TAILQ_LAST(&strm->outqueue, sctp_streamhead); SCTP_TCB_SEND_UNLOCK(stcb); if (sp == NULL) { /* ???? Huh ??? last msg is gone */ #ifdef INVARIANTS panic("Warning: Last msg marked incomplete, yet nothing left?"); #else SCTP_PRINTF("Warning: Last msg marked incomplete, yet nothing left?\n"); strm->last_msg_incomplete = 0; #endif goto do_a_copy_in; } } while (uio->uio_resid > 0) { /* How much room do we have? */ struct mbuf *new_tail, *mm; if (SCTP_SB_LIMIT_SND(so) > stcb->asoc.total_output_queue_size) max_len = SCTP_SB_LIMIT_SND(so) - stcb->asoc.total_output_queue_size; else max_len = 0; if ((max_len > SCTP_BASE_SYSCTL(sctp_add_more_threshold)) || (max_len && (SCTP_SB_LIMIT_SND(so) < SCTP_BASE_SYSCTL(sctp_add_more_threshold))) || (uio->uio_resid && (uio->uio_resid <= (int)max_len))) { sndout = 0; new_tail = NULL; if (hold_tcblock) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } mm = sctp_copy_resume(uio, max_len, user_marks_eor, &error, &sndout, &new_tail); if ((mm == NULL) || error) { if (mm) { sctp_m_freem(mm); } goto out; } /* Update the mbuf and count */ SCTP_TCB_SEND_LOCK(stcb); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* * we need to get out. Peer probably * aborted. */ sctp_m_freem(mm); if (stcb->asoc.state & SCTP_PCB_FLAGS_WAS_ABORTED) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; } SCTP_TCB_SEND_UNLOCK(stcb); goto out; } if (sp->tail_mbuf) { /* tack it to the end */ SCTP_BUF_NEXT(sp->tail_mbuf) = mm; sp->tail_mbuf = new_tail; } else { /* A stolen mbuf */ sp->data = mm; sp->tail_mbuf = new_tail; } sctp_snd_sb_alloc(stcb, sndout); atomic_add_int(&sp->length, sndout); len += sndout; if (srcv->sinfo_flags & SCTP_SACK_IMMEDIATELY) { sp->sinfo_flags |= SCTP_SACK_IMMEDIATELY; } /* Did we reach EOR? */ if ((uio->uio_resid == 0) && ((user_marks_eor == 0) || (srcv->sinfo_flags & SCTP_EOF) || (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) { sp->msg_is_complete = 1; } else { sp->msg_is_complete = 0; } SCTP_TCB_SEND_UNLOCK(stcb); } if (uio->uio_resid == 0) { /* got it all? */ continue; } /* PR-SCTP? */ if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) { /* * This is ugly but we must assure locking * order */ if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } sctp_prune_prsctp(stcb, asoc, srcv, sndlen); inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); if (SCTP_SB_LIMIT_SND(so) > stcb->asoc.total_output_queue_size) max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; else max_len = 0; if (max_len > 0) { continue; } SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } /* wait for space now */ if (non_blocking) { /* Non-blocking io in place out */ goto skip_out_eof; } /* What about the INIT, send it maybe */ if (queue_only_for_init) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { /* a collision took us forward? */ queue_only = 0; } else { sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT); queue_only = 1; } } if ((net->flight_size > net->cwnd) && (asoc->sctp_cmt_on_off == 0)) { SCTP_STAT_INCR(sctps_send_cwnd_avoid); queue_only = 1; } else if (asoc->ifp_had_enobuf) { SCTP_STAT_INCR(sctps_ifnomemqueued); if (net->flight_size > (2 * net->mtu)) { queue_only = 1; } asoc->ifp_had_enobuf = 0; } un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && (stcb->asoc.total_flight > 0) && (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) && (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { /*- * Ok, Nagle is set on and we have data outstanding. * Don't send anything and let SACKs drive out the * data unless we have a "full" segment to send. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED); } SCTP_STAT_INCR(sctps_naglequeued); nagle_applies = 1; } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED); } SCTP_STAT_INCR(sctps_naglesent); nagle_applies = 0; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only, nagle_applies, un_sent); sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size, stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count); } if (queue_only_for_init) queue_only_for_init = 0; if ((queue_only == 0) && (nagle_applies == 0)) { /*- * need to start chunk output * before blocking.. note that if * a lock is already applied, then * the input via the net is happening * and I don't need to start output :-D */ if (hold_tcblock == 0) { if (SCTP_TCB_TRYLOCK(stcb)) { hold_tcblock = 1; sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } if (hold_tcblock == 1) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } } SOCKBUF_LOCK(&so->so_snd); /*- * This is a bit strange, but I think it will * work. The total_output_queue_size is locked and * protected by the TCB_LOCK, which we just released. * There is a race that can occur between releasing it * above, and me getting the socket lock, where sacks * come in but we have not put the SB_WAIT on the * so_snd buffer to get the wakeup. After the LOCK * is applied the sack_processing will also need to * LOCK the so->so_snd to do the actual sowwakeup(). So * once we have the socket buffer lock if we recheck the * size we KNOW we will get to sleep safely with the * wakeup flag in place. */ if (SCTP_SB_LIMIT_SND(so) <= (stcb->asoc.total_output_queue_size + min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK, asoc, (size_t)uio->uio_resid); } be.error = 0; stcb->block_entry = &be; error = sbwait(&so->so_snd); stcb->block_entry = NULL; if (error || so->so_error || be.error) { if (error == 0) { if (so->so_error) error = so->so_error; if (be.error) { error = be.error; } } SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK, asoc, stcb->asoc.total_output_queue_size); } } SOCKBUF_UNLOCK(&so->so_snd); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { goto out_unlocked; } } SCTP_TCB_SEND_LOCK(stcb); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_SEND_UNLOCK(stcb); goto out_unlocked; } if (sp) { if (sp->msg_is_complete == 0) { strm->last_msg_incomplete = 1; if (stcb->asoc.idata_supported == 0) { asoc->stream_locked = 1; asoc->stream_locked_on = srcv->sinfo_stream; } } else { sp->sender_all_done = 1; strm->last_msg_incomplete = 0; asoc->stream_locked = 0; } } else { SCTP_PRINTF("Huh no sp TSNH?\n"); strm->last_msg_incomplete = 0; asoc->stream_locked = 0; } SCTP_TCB_SEND_UNLOCK(stcb); if (uio->uio_resid == 0) { got_all_of_the_send = 1; } } else { /* We send in a 0, since we do NOT have any locks */ error = sctp_msg_append(stcb, net, top, srcv, 0); top = NULL; if (srcv->sinfo_flags & SCTP_EOF) { /* * This should only happen for Panda for the mbuf * send case, which does NOT yet support EEOR mode. * Thus, we can just set this flag to do the proper * EOF handling. */ got_all_of_the_send = 1; } } if (error) { goto out; } dataless_eof: /* EOF thing ? */ if ((srcv->sinfo_flags & SCTP_EOF) && (got_all_of_the_send == 1)) { SCTP_STAT_INCR(sctps_sends_with_eof); error = 0; if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && sctp_is_there_unsent_data(stcb, SCTP_SO_LOCKED) == 0) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* there is nothing queued to send, so I'm done... */ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* only send SHUTDOWN the first time through */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } } else { /*- * we still got (or just got) data to send, so set * SHUTDOWN_PENDING */ /*- * XXX sockets draft says that SCTP_EOF should be * sent with no data. currently, we will allow user * data to be sent first and move to * SHUTDOWN-PENDING */ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; abort_anyway: if (free_cnt_applied) { atomic_add_int(&stcb->asoc.refcnt, -1); free_cnt_applied = 0; } snprintf(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_LOCKED); /* * now relock the stcb so everything * is sane */ hold_tcblock = 0; stcb = NULL; goto out; } sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); sctp_feature_off(inp, SCTP_PCB_FLAGS_NODELAY); } } } skip_out_eof: if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { some_on_control = 1; } if (queue_only_for_init) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { /* a collision took us forward? */ queue_only = 0; } else { sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); queue_only = 1; } } if ((net->flight_size > net->cwnd) && (stcb->asoc.sctp_cmt_on_off == 0)) { SCTP_STAT_INCR(sctps_send_cwnd_avoid); queue_only = 1; } else if (asoc->ifp_had_enobuf) { SCTP_STAT_INCR(sctps_ifnomemqueued); if (net->flight_size > (2 * net->mtu)) { queue_only = 1; } asoc->ifp_had_enobuf = 0; } un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && (stcb->asoc.total_flight > 0) && (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) && (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { /*- * Ok, Nagle is set on and we have data outstanding. * Don't send anything and let SACKs drive out the * data unless wen have a "full" segment to send. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED); } SCTP_STAT_INCR(sctps_naglequeued); nagle_applies = 1; } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED); } SCTP_STAT_INCR(sctps_naglesent); nagle_applies = 0; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only, nagle_applies, un_sent); sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size, stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count); } if ((queue_only == 0) && (nagle_applies == 0) && (stcb->asoc.peers_rwnd && un_sent)) { /* we can attempt to send too. */ if (hold_tcblock == 0) { /* * If there is activity recv'ing sacks no need to * send */ if (SCTP_TCB_TRYLOCK(stcb)) { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); hold_tcblock = 1; } } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } } else if ((queue_only == 0) && (stcb->asoc.peers_rwnd == 0) && (stcb->asoc.total_flight == 0)) { /* We get to have a probe outstanding */ if (hold_tcblock == 0) { hold_tcblock = 1; SCTP_TCB_LOCK(stcb); } sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } else if (some_on_control) { int num_out, reason, frag_point; /* Here we do control only */ if (hold_tcblock == 0) { hold_tcblock = 1; SCTP_TCB_LOCK(stcb); } frag_point = sctp_get_frag_point(stcb, &stcb->asoc); (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out, &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_LOCKED); } SCTPDBG(SCTP_DEBUG_OUTPUT1, "USR Send complete qo:%d prw:%d unsent:%d tf:%d cooq:%d toqs:%d err:%d\n", queue_only, stcb->asoc.peers_rwnd, un_sent, stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue, stcb->asoc.total_output_queue_size, error); out: out_unlocked: if (local_soresv && stcb) { atomic_subtract_int(&stcb->asoc.sb_send_resv, sndlen); } if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); } if ((stcb) && hold_tcblock) { SCTP_TCB_UNLOCK(stcb); } if (stcb && free_cnt_applied) { atomic_add_int(&stcb->asoc.refcnt, -1); } #ifdef INVARIANTS if (stcb) { if (mtx_owned(&stcb->tcb_mtx)) { panic("Leaving with tcb mtx owned?"); } if (mtx_owned(&stcb->tcb_send_mtx)) { panic("Leaving with tcb send mtx owned?"); } } #endif if (top) { sctp_m_freem(top); } if (control) { sctp_m_freem(control); } return (error); } /* * generate an AUTHentication chunk, if required */ struct mbuf * sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end, struct sctp_auth_chunk **auth_ret, uint32_t * offset, struct sctp_tcb *stcb, uint8_t chunk) { struct mbuf *m_auth; struct sctp_auth_chunk *auth; int chunk_len; struct mbuf *cn; if ((m_end == NULL) || (auth_ret == NULL) || (offset == NULL) || (stcb == NULL)) return (m); if (stcb->asoc.auth_supported == 0) { return (m); } /* does the requested chunk require auth? */ if (!sctp_auth_is_required_chunk(chunk, stcb->asoc.peer_auth_chunks)) { return (m); } m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_NOWAIT, 1, MT_HEADER); if (m_auth == NULL) { /* no mbuf's */ return (m); } /* reserve some space if this will be the first mbuf */ if (m == NULL) SCTP_BUF_RESV_UF(m_auth, SCTP_MIN_OVERHEAD); /* fill in the AUTH chunk details */ auth = mtod(m_auth, struct sctp_auth_chunk *); bzero(auth, sizeof(*auth)); auth->ch.chunk_type = SCTP_AUTHENTICATION; auth->ch.chunk_flags = 0; chunk_len = sizeof(*auth) + sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id); auth->ch.chunk_length = htons(chunk_len); auth->hmac_id = htons(stcb->asoc.peer_hmac_id); /* key id and hmac digest will be computed and filled in upon send */ /* save the offset where the auth was inserted into the chain */ *offset = 0; for (cn = m; cn; cn = SCTP_BUF_NEXT(cn)) { *offset += SCTP_BUF_LEN(cn); } /* update length and return pointer to the auth chunk */ SCTP_BUF_LEN(m_auth) = chunk_len; m = sctp_copy_mbufchain(m_auth, m, m_end, 1, chunk_len, 0); if (auth_ret != NULL) *auth_ret = auth; return (m); } #ifdef INET6 int sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t * ro) { struct nd_prefix *pfx = NULL; struct nd_pfxrouter *pfxrtr = NULL; struct sockaddr_in6 gw6; if (ro == NULL || ro->ro_rt == NULL || src6->sin6_family != AF_INET6) return (0); /* get prefix entry of address */ + ND6_RLOCK(); LIST_FOREACH(pfx, &MODULE_GLOBAL(nd_prefix), ndpr_entry) { if (pfx->ndpr_stateflags & NDPRF_DETACHED) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&pfx->ndpr_prefix.sin6_addr, &src6->sin6_addr, &pfx->ndpr_mask)) break; } /* no prefix entry in the prefix list */ if (pfx == NULL) { + ND6_RUNLOCK(); SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefix entry for "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); return (0); } SCTPDBG(SCTP_DEBUG_OUTPUT2, "v6src_match_nexthop(), Prefix entry is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); /* search installed gateway from prefix entry */ LIST_FOREACH(pfxrtr, &pfx->ndpr_advrtrs, pfr_entry) { memset(&gw6, 0, sizeof(struct sockaddr_in6)); gw6.sin6_family = AF_INET6; gw6.sin6_len = sizeof(struct sockaddr_in6); memcpy(&gw6.sin6_addr, &pfxrtr->router->rtaddr, sizeof(struct in6_addr)); SCTPDBG(SCTP_DEBUG_OUTPUT2, "prefix router is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&gw6); SCTPDBG(SCTP_DEBUG_OUTPUT2, "installed router is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway); if (sctp_cmpaddr((struct sockaddr *)&gw6, ro->ro_rt->rt_gateway)) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is installed\n"); + ND6_RUNLOCK(); return (1); } } + ND6_RUNLOCK(); SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is not installed\n"); return (0); } #endif int sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t * ro) { #ifdef INET struct sockaddr_in *sin, *mask; struct ifaddr *ifa; struct in_addr srcnetaddr, gwnetaddr; if (ro == NULL || ro->ro_rt == NULL || sifa->address.sa.sa_family != AF_INET) { return (0); } ifa = (struct ifaddr *)sifa->ifa; mask = (struct sockaddr_in *)(ifa->ifa_netmask); sin = &sifa->address.sin; srcnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", srcnetaddr.s_addr); sin = (struct sockaddr_in *)ro->ro_rt->rt_gateway; gwnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway); SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gwnetaddr.s_addr); if (srcnetaddr.s_addr == gwnetaddr.s_addr) { return (1); } #endif return (0); } Index: user/alc/PQ_LAUNDRY/sys/netinet6/in6.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet6/in6.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/netinet6/in6.c (revision 306832) @@ -1,2496 +1,2501 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; /* Announce arrival of local address to all FIBs. */ rt_newaddrmsg(cmd, &ia->ia_ifa, 0, &rt); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } /* * XXX: should we check if ifa_dstaddr is NULL and return * an error? */ ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) ifa_free(&ia->ia_ifa); if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_addrcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } + nd6_prefix_rele(pr); /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); - if (pr && pr->ndpr_addrcnt == 0) - prelist_remove(pr); + if (pr != NULL && pr->ndpr_addrcnt == 0) { + ND6_WLOCK(); + nd6_prefix_unlink(pr, NULL); + ND6_WUNLOCK(); + nd6_prefix_del(pr); + } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for an new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0)); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_addrcnt--; /* Do not delete lles within prefix if refcont != 0 */ if (ia->ia6_ndpr->ndpr_addrcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other subsystems about address change/arrival: * 1) Notifies device handler on the first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } IF_ADDR_RUNLOCK(ifp); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) return (error); } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) return (error); ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * find the internet address corresponding to a given address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); IF_ADDR_RUNLOCK(ifp); return (besta); } TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); IF_ADDR_RUNLOCK(ifp); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); IF_ADDR_RUNLOCK(ifp); return dep[1]; } IF_ADDR_RUNLOCK(ifp); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } IF_ADDR_RUNLOCK(ifp); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) || (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD)) return (0); /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionally, users may try to assign addresses * before the interface becomes up (or running). * This function returns EAGAIN in that case. * The caller should mark "tentative" on the address instead of * performing DAD immediately. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return (EAGAIN); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } IFNET_RUNLOCK_NOSLEEP(); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_IEEE80211: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_FDDI: /* RFC2467 */ return (64); case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */ return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_ARCNET: /* RFC2497 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in6_lltable_destroy_lle_unlocked(struct llentry *lle) { LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); in6_lltable_destroy_lle_unlocked(lle); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; int error; char ip6buf[INET6_ADDRSTRLEN]; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* Our local addresses are always only installed on the default FIB. */ sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); error = fib6_lookup_nh_basic(RT_DEFAULT_FIB, &dst, scopeid, 0, 0, &nh6); if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { ifa_free(ifa); return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { in6_lltable_destroy_lle_unlocked(lle); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ifp, ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: user/alc/PQ_LAUNDRY/sys/netinet6/in6_ifattach.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet6/in6_ifattach.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/netinet6/in6_ifattach.c (revision 306832) @@ -1,946 +1,937 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } IF_ADDR_RUNLOCK(ifp); return -1; found: IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_FDDI: case IFT_ISO88025: case IFT_ATM: case IFT_IEEE1394: case IFT_IEEE80211: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { IF_ADDR_RUNLOCK(ifp); return -1; } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } if (bcmp(addr, allone, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_ARCNET: if (addrlen != 1) { IF_ADDR_RUNLOCK(ifp); return -1; } if (!addr[0]) { IF_ADDR_RUNLOCK(ifp); return -1; } bzero(&in6->s6_addr[8], 8); in6->s6_addr[15] = addr[0]; /* * due to insufficient bitwidth, we mark it local. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ IF_ADDR_RUNLOCK(ifp); return -1; default: IF_ADDR_RUNLOCK(ifp); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { IF_ADDR_RUNLOCK(ifp); return -1; } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } IF_ADDR_RUNLOCK(ifp); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK_NOSLEEP(); goto success; } } IFNET_RUNLOCK_NOSLEEP(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; + struct nd_prefix *pr; int error; /* * configure link-local address. */ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ KASSERT(ia != NULL, ("%s: ia == NULL, ifp=%p", __func__, ifp)); ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ - if (nd6_prefix_lookup(&pr0) == NULL) { + if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); - } + } else + nd6_prefix_rele(pr); return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_addr in6; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; break; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { struct ifaddr *ifa; in6 = in6addr_loopback; ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &in6); if (ifa == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } else ifa_free(ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { int error; ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { error = in6_ifattach_linklocal(ifp, altifp); #if 0 if (error) log(LOG_NOTICE, "in6_ifattach_linklocal: " "failed to add a link-local addr to %s\n", if_name(ifp)); #endif } else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * * When shutting down a VNET we clean up layers top-down. In that case * upper layer protocols (ulp) are cleaned up already and locks are destroyed * and we must not call into these cleanup functions anymore, thus purgeulp * is set to 0 in that case by in6_ifdetach_destroy(). * The normal case of destroying a (cloned) interface still needs to cleanup * everything related to the interface and will have purgeulp set to 1. */ static void _in6_ifdetach(struct ifnet *ifp, int purgeulp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* - * Remove neighbor management table. - * Enabling the nd6_purge will panic on vmove for interfaces on VNET - * teardown as the IPv6 layer is cleaned up already and the locks - * are destroyed. - */ - if (purgeulp) - nd6_purge(ifp); - - /* * nuke any of IPv6 addresses we have * XXX: all addresses should be already removed */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } if (purgeulp) { in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); } /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* - * remove neighbor management table. we call it twice just to make - * sure we nuke everything. maybe we need just one call. - * XXX: since the first call did not release addresses, some prefixes - * might remain. We should call nd6_purge() again to release the - * prefixes after removing all addresses above. - * (Or can we just delay calling nd6_purge until at this point?) + * Remove neighbor management table. + * Enabling the nd6_purge will panic on vmove for interfaces on VNET + * teardown as the IPv6 layer is cleaned up already and the locks + * are destroyed. */ if (purgeulp) nd6_purge(ifp); } void in6_ifdetach(struct ifnet *ifp) { _in6_ifdetach(ifp, 1); } void in6_ifdetach_destroy(struct ifnet *ifp) { _in6_ifdetach(ifp, 0); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. */ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in6_multi) purgeinms; struct in6_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN6_MULTI_LOCK(); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, in6m_entry); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, in6m_entry, tinm) { LIST_REMOVE(inm, in6m_entry); in6m_release_locked(inm); } mld_ifdetach(ifp); IN6_MULTI_UNLOCK(); } void in6_ifattach_destroy(void) { callout_drain(&V_in6_tmpaddrtimer_ch); } static void in6_ifattach_init(void *dummy) { /* Timer for regeneranation of temporary addresses randomize ID. */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); } /* * Cheat. * This must be after route_init(), which is now SI_ORDER_THIRD. */ SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, in6_ifattach_init, NULL); Index: user/alc/PQ_LAUNDRY/sys/netinet6/nd6.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet6/nd6.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/netinet6/nd6.c (revision 306832) @@ -1,2698 +1,2753 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include +#include #include #include +#include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif static eventhandler_tag lle_event_eh, iflladdr_event_eh; VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(struct rwlock, nd6_lock); +VNET_DEFINE(uint64_t, nd6_list_genid); +VNET_DEFINE(struct mtx, nd6_onlink_mtx); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static void nd6_free(struct llentry **, int); static void nd6_free_redirect(const struct llentry *); static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **); static int nd6_need_cache(struct ifnet *); static VNET_DEFINE(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE(struct callout, nd6_timer_ch); #define V_nd6_timer_ch VNET(nd6_timer_ch) static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct rt_addrinfo rtinfo; struct sockaddr_in6 dst; struct sockaddr_dl gw; struct ifnet *ifp; int type; LLE_WLOCK_ASSERT(lle); if (lltable_get_af(lle->lle_tbl) != AF_INET6) return; switch (evt) { case LLENTRY_RESOLVED: type = RTM_ADD; KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); break; case LLENTRY_EXPIRED: type = RTM_DELETE; break; default: return; } ifp = lltable_get_ifp(lle->lle_tbl); bzero(&dst, sizeof(dst)); bzero(&gw, sizeof(gw)); bzero(&rtinfo, sizeof(rtinfo)); lltable_fill_sa_entry(lle, (struct sockaddr *)&dst); dst.sin6_scope_id = in6_getscopezone(ifp, in6_addrscope(&dst.sin6_addr)); gw.sdl_len = sizeof(struct sockaddr_dl); gw.sdl_family = AF_LINK; gw.sdl_alen = ifp->if_addrlen; gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | ( type == RTM_ADD ? RTF_UP: 0), 0, RT_DEFAULT_FIB); } /* * A handler for interface link layer address change event. */ static void nd6_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE6(ifp)); } void nd6_init(void) { - rw_init(&V_nd6_lock, "nd6"); + mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF); + rw_init(&V_nd6_lock, "nd6 list"); LIST_INIT(&V_nd_prefix); - - /* initialization of the default router list */ TAILQ_INIT(&V_nd_defrouter); /* Start timers. */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); callout_init(&V_nd6_timer_ch, 0); callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); nd6_dad_init(); if (IS_DEFAULT_VNET(curvnet)) { lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event, nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) { EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh); } rw_destroy(&V_nd6_lock); + mtx_destroy(&V_nd6_onlink_mtx); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) nd->flags |= ND6_IFF_ACCEPT_RTADV; if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd) { struct ifaddr *ifa, *next; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; /* stop DAD processing */ nd6_dad_stop(ifa); } IF_ADDR_RUNLOCK(ifp); free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return; nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; switch (ifp->if_type) { case IFT_ARCNET: ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */ break; case IFT_FDDI: ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */ break; case IFT_ISO88025: ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu); break; default: ndi->maxmtu = ifp->if_mtu; break; } /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. */ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ static void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->lle_timer); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->lle_timer, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->lle_timer, tick, nd6_llinfo_timer, ln); } } if (canceled > 0) LLE_REMREF(ln); } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. * * Set noinline to be dtrace-friendly */ static __noinline struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr hdr; struct mbuf *m; if (ln->la_hold == NULL) return (NULL); /* * assume every packet in la_hold has the same IP header */ m = ln->la_hold; if (sizeof(hdr) > m->m_len) return (NULL); m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr); *src = hdr.ip6_src; return (src); } /* * Checks if we need to switch from STALE state. * * RFC 4861 requires switching from STALE to DELAY state * on first packet matching entry, waiting V_nd6_delay and * transition to PROBE state (if upper layer confirmation was * not received). * * This code performs a bit differently: * On packet hit we don't change state (but desired state * can be guessed by control plane). However, after V_nd6_delay * seconds code will transition to PROBE state (so DELAY state * is kinda skipped in most situations). * * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so * we perform the following upon entering STALE state: * * 1) Arm timer to run each V_nd6_delay seconds to make sure that * if packet was transmitted at the start of given interval, we * would be able to switch to PROBE state in V_nd6_delay seconds * as user expects. * * 2) Reschedule timer until original V_nd6_gctimer expires keeping * lle in STALE state (remaining timer value stored in lle_remtime). * * 3) Reschedule timer if packet was transmitted less that V_nd6_delay * seconds ago. * * Returns non-zero value if the entry is still STALE (storing * the next timer interval in @pdelay). * * Returns zero value if original timer expired or we need to switch to * PROBE (store that in @do_switch variable). */ static int nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch) { int nd_delay, nd_gctimer, r_skip_req; time_t lle_hittime; long delay; *do_switch = 0; nd_gctimer = V_nd6_gctimer; nd_delay = V_nd6_delay; LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; lle_hittime = lle->lle_hittime; LLE_REQ_UNLOCK(lle); if (r_skip_req > 0) { /* * Nonzero r_skip_req value was set upon entering * STALE state. Since value was not changed, no * packets were passed using this lle. Ask for * timer reschedule and keep STALE state. */ delay = (long)(MIN(nd_gctimer, nd_delay)); delay *= hz; if (lle->lle_remtime > delay) lle->lle_remtime -= delay; else { delay = lle->lle_remtime; lle->lle_remtime = 0; } if (delay == 0) { /* * The original ng6_gctime timeout ended, * no more rescheduling. */ return (0); } *pdelay = delay; return (1); } /* * Packet received. Verify timestamp */ delay = (long)(time_uptime - lle_hittime); if (delay < nd_delay) { /* * V_nd6_delay still not passed since the first * hit in STALE state. * Reshedule timer and return. */ *pdelay = (long)(nd_delay - delay) * hz; return (1); } /* Request switching to probe */ *do_switch = 1; return (0); } /* * Switch @lle state to new state optionally arming timers. * * Set noinline to be dtrace-friendly */ __noinline void nd6_llinfo_setstate(struct llentry *lle, int newstate) { struct ifnet *ifp; int nd_gctimer, nd_delay; long delay, remtime; delay = 0; remtime = 0; switch (newstate) { case ND6_LLINFO_INCOMPLETE: ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(lle)) { ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->reachable * hz; } break; case ND6_LLINFO_STALE: /* * Notify fast path that we want to know if any packet * is transmitted by setting r_skip_req. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); nd_delay = V_nd6_delay; nd_gctimer = V_nd6_gctimer; delay = (long)(MIN(nd_gctimer, nd_delay)) * hz; remtime = (long)nd_gctimer * hz - delay; break; case ND6_LLINFO_DELAY: lle->la_asked = 0; delay = (long)V_nd6_delay * hz; break; } if (delay > 0) nd6_llinfo_settimer_locked(lle, delay); lle->lle_remtime = remtime; lle->ln_state = newstate; } /* * Timer-dependent part of nd state machine. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_llinfo_timer(void *arg) { struct llentry *ln; struct in6_addr *dst, *pdst, *psrc, src; struct ifnet *ifp; struct nd_ifinfo *ndi; int do_switch, send_ns; long delay; KASSERT(arg != NULL, ("%s: arg NULL", __func__)); ln = (struct llentry *)arg; ifp = lltable_get_ifp(ln->lle_tbl); CURVNET_SET(ifp->if_vnet); ND6_RLOCK(); LLE_WLOCK(ln); if (callout_pending(&ln->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by nd6_llinfo_settimer_locked above since canceled * would have been 1. */ LLE_WUNLOCK(ln); ND6_RUNLOCK(); CURVNET_RESTORE(); return; } ndi = ND_IFINFO(ifp); send_ns = 0; dst = &ln->r_l3addr.addr6; pdst = dst; if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer_locked(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer_locked(ln, ln->ln_ntick); } goto done; } if (ln->la_flags & LLE_STATIC) { goto done; } if (ln->la_flags & LLE_DELETED) { nd6_free(&ln, 0); goto done; } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->la_asked < V_nd6_mmaxtries) { ln->la_asked++; send_ns = 1; /* Send NS to multicast address */ pdst = NULL; } else { struct mbuf *m = ln->la_hold; if (m) { struct mbuf *m0; /* * assuming every packet in la_hold has the * same IP header. Send error after unlock. */ m0 = m->m_nextpkt; m->m_nextpkt = NULL; ln->la_hold = m0; clear_llinfo_pqueue(ln); } nd6_free(&ln, 0); if (m != NULL) icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp); } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); break; case ND6_LLINFO_STALE: if (nd6_is_stale(ln, &delay, &do_switch) != 0) { /* * No packet has used this entry and GC timeout * has not been passed. Reshedule timer and * return. */ nd6_llinfo_settimer_locked(ln, delay); break; } if (do_switch == 0) { /* * GC timer has ended and entry hasn't been used. * Run Garbage collector (RFC 4861, 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) nd6_free(&ln, 1); break; } /* Entry has been used AND delay timer has ended. */ /* FALLTHROUGH */ case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE); send_ns = 1; } else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */ break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; send_ns = 1; } else { nd6_free(&ln, 0); } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (ln != NULL) ND6_RUNLOCK(); if (send_ns != 0) { nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); psrc = nd6_llinfo_get_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd6_ns_output(ifp, psrc, pdst, dst, NULL); } if (ln != NULL) LLE_FREE_LOCKED(ln); CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_drhead drq; + struct nd_prhead prl; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; struct in6_ifaddr *ia6, *nia6; + bool onlink_locked; TAILQ_INIT(&drq); + LIST_INIT(&prl); - /* expire default router list */ ND6_WLOCK(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) if (dr->expire && dr->expire < time_uptime) defrouter_unlink(dr, &drq); ND6_WUNLOCK(); while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } /* * expire interface addresses. * in the past the loop was inside prefix expiry processing. * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ addrloop: TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) { /* * Schedule DAD for a tentative address. This happens * if the interface was down or not running * when the address was configured. */ int delay; delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); nd6_dad_start((struct ifaddr *)ia6, delay); } else { /* * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 || (ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ia6->ia_ifp)->flags & ND6_IFF_IFDISABLED) != 0) { ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } - /* expire prefix list */ + ND6_WLOCK(); + onlink_locked = false; +restart: LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* - * check prefix lifetime. - * since pltime is just for autoconf, pltime processing for - * prefix is not necessary. + * Expire prefixes. Since the pltime is only used for + * autoconfigured addresses, pltime processing for prefixes is + * not necessary. + * + * Only unlink after all derived addresses have expired. This + * may not occur until two hours after the prefix has expired + * per RFC 4862. If the prefix expires before its derived + * addresses, mark it off-link. This will be done automatically + * after unlinking if no address references remain. */ - if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME && - time_uptime - pr->ndpr_lastupdate > pr->ndpr_vltime) { + if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME || + time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime) + continue; - /* - * address expiration and prefix expiration are - * separate. NEVER perform in6_purgeaddr here. - */ - prelist_remove(pr); + if (pr->ndpr_addrcnt == 0) { + nd6_prefix_unlink(pr, &prl); + continue; } + if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { + if (!onlink_locked) { + onlink_locked = ND6_ONLINK_TRYLOCK(); + if (!onlink_locked) { + ND6_WUNLOCK(); + ND6_ONLINK_LOCK(); + onlink_locked = true; + ND6_WLOCK(); + goto restart; + } + } + (void)nd6_prefix_offlink(pr); + } } + ND6_WUNLOCK(); + if (onlink_locked) + ND6_ONLINK_UNLOCK(); + while ((pr = LIST_FIRST(&prl)) != NULL) { + LIST_REMOVE(pr, ndpr_entry); + nd6_prefix_del(pr); + } + callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; ifp = ia6->ia_ifa.ifa_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); IF_ADDR_RUNLOCK(ifp); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Remove prefix and default router list entries corresponding to ifp. Neighbor * cache entries are freed in in6_domifdetach(). */ void nd6_purge(struct ifnet *ifp) { struct nd_drhead drq; + struct nd_prhead prl; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; TAILQ_INIT(&drq); + LIST_INIT(&prl); /* * Nuke default router list entries toward ifp. * We defer removal of default router list entries that is installed * in the routing table, in order to keep additional side effects as * small as possible. */ ND6_WLOCK(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->installed) continue; if (dr->ifp == ifp) defrouter_unlink(dr, &drq); } - TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (!dr->installed) continue; if (dr->ifp == ifp) defrouter_unlink(dr, &drq); } + + /* + * Remove prefixes on ifp. We should have already removed addresses on + * this interface, so no addresses should be referencing these prefixes. + */ + LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { + if (pr->ndpr_ifp == ifp) + nd6_prefix_unlink(pr, &prl); + } ND6_WUNLOCK(); + /* Delete the unlinked router and prefix objects. */ while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } - - /* Nuke prefix list entries toward ifp */ - LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { - if (pr->ndpr_ifp == ifp) { - /* - * Because if_detach() does *not* release prefixes - * while purging addresses the reference count will - * still be above zero. We therefore reset it to - * make sure that the prefix really gets purged. - */ - pr->ndpr_addrcnt = 0; - - prelist_remove(pr); - } + while ((pr = LIST_FIRST(&prl)) != NULL) { + LIST_REMOVE(pr, ndpr_entry); + nd6_prefix_del(pr); } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select(); } } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6); return (ln); } struct llentry * nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6); if (ln != NULL) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *dstaddr; struct rt_addrinfo info; struct sockaddr_in6 rt_key; - struct sockaddr *dst6; - int fibnum; + const struct sockaddr *dst6; + uint64_t genid; + int error, fibnum; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } bzero(&rt_key, sizeof(rt_key)); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; /* Always use the default FIB here. XXME - why? */ fibnum = RT_DEFAULT_FIB; /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ + ND6_RLOCK(); +restart: LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; - if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { - + if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { /* Always use the default FIB here. */ - dst6 = (struct sockaddr *)&pr->ndpr_prefix; + dst6 = (const struct sockaddr *)&pr->ndpr_prefix; + genid = V_nd6_list_genid; + ND6_RUNLOCK(); + /* Restore length field before retrying lookup */ rt_key.sin6_len = sizeof(rt_key); - if (rib_lookup_info(fibnum, dst6, 0, 0, &info) != 0) + error = rib_lookup_info(fibnum, dst6, 0, 0, &info); + + ND6_RLOCK(); + if (genid != V_nd6_list_genid) + goto restart; + if (error != 0) continue; + /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. In the case * where RADIX_MPATH is enabled, multiple route * entries (of the same rt_key value) will be * installed because the interface addresses all * differ. */ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, - &rt_key.sin6_addr)) + &rt_key.sin6_addr)) continue; } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, - &addr->sin6_addr, &pr->ndpr_mask)) + &addr->sin6_addr, &pr->ndpr_mask)) { + ND6_RUNLOCK(); return (1); + } } + ND6_RUNLOCK(); /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ dstaddr = ifa_ifwithdstaddr((const struct sockaddr *)addr, RT_ALL_FIBS); if (dstaddr != NULL) { if (dstaddr->ifa_ifp == ifp) { ifa_free(dstaddr); return (1); } ifa_free(dstaddr); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && TAILQ_EMPTY(&V_nd_defrouter) && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *lle; int rc = 0; IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ IF_AFDATA_RLOCK(ifp); if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } IF_AFDATA_RUNLOCK(ifp); return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_free(struct llentry **lnp, int gc) { struct ifnet *ifp; struct llentry *ln; struct nd_defrouter *dr; ln = *lnp; *lnp = NULL; LLE_WLOCK_ASSERT(ln); ND6_RLOCK_ASSERT(); ifp = lltable_get_ifp(ln->lle_tbl); if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0) dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp); else dr = NULL; ND6_RUNLOCK(); if ((ln->la_flags & LLE_DELETED) == 0) EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); LLE_REMREF(ln); LLE_WUNLOCK(ln); defrouter_rele(dr); return; } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&ln->r_l3addr.addr6, ifp); } if (dr) { /* * Since defrouter_select() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select(); } /* * If this entry was added by an on-link redirect, remove the * corresponding host route. */ if (ln->la_flags & LLE_REDIRECT) nd6_free_redirect(ln); if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { /* Remove callout reference */ LLE_REMREF(ln); lltable_unlink_entry(ln->lle_tbl, ln); } IF_AFDATA_UNLOCK(ifp); llentry_free(ln); if (dr != NULL) defrouter_rele(dr); } static int nd6_isdynrte(const struct rtentry *rt, void *xap) { if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC)) return (1); return (0); } /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. */ static void nd6_free_redirect(const struct llentry *ln) { int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6; info.rti_filter = nd6_isdynrte; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); } /* * Rejuvenate this function for routing operations related * processing. */ void nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; struct ifnet *ifp; gateway = (struct sockaddr_in6 *)rt->rt_gateway; ifp = rt->rt_ifp; switch (req) { case RTM_ADD: break; case RTM_DELETE: if (!ifp) return; /* * Only indirect routes are interesting. */ if ((rt->rt_flags & RTF_GATEWAY) == 0) return; /* * check for default route */ if (IN6_ARE_ADDR_EQUAL(&in6addr_any, &SIN6(rt_key(rt))->sin6_addr)) { dr = defrouter_lookup(&gateway->sin6_addr, ifp); if (dr != NULL) { dr->installed = 0; defrouter_rele(dr); } } break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * used to change host variables from userland. * intended for a use on router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has an link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa != NULL) { /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transision */ /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } IF_ADDR_RUNLOCK(ifp); } } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transision */ /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa != NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } } ND_IFINFO(ifp)->flags = ND.flags; break; #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select(); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ + struct in6_ifaddr *ia, *ia_next; struct nd_prefix *pr, *next; + struct nd_prhead prl; - LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { - struct in6_ifaddr *ia, *ia_next; + LIST_INIT(&prl); + ND6_WLOCK(); + LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ + nd6_prefix_unlink(pr, &prl); + } + ND6_WUNLOCK(); - /* do we really have to remove addresses as well? */ + while ((pr = LIST_FIRST(&prl)) != NULL) { + LIST_REMOVE(pr, ndpr_entry); /* XXXRW: in6_ifaddrhead locking. */ TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } - prelist_remove(pr); + nd6_prefix_del(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ struct nd_drhead drq; struct nd_defrouter *dr; TAILQ_INIT(&drq); defrouter_reset(); ND6_WLOCK(); while ((dr = TAILQ_FIRST(&V_nd_defrouter)) != NULL) defrouter_unlink(dr, &drq); ND6_WUNLOCK(); while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } defrouter_select(); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&nb_addr, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + ln->lle_remtime / hz + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Calculates new isRouter value based on provided parameters and * returns it. */ static int nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr, int ln_router) { /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * is_new old_addr new_addr NS RS RA redir * D R * 0 n n (1) c ? s * 0 y n (2) c s s * 0 n y (3) c s s * 0 y y (4) c s s * 0 y y (5) c s s * 1 -- n (6) c c c s * 1 -- y (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_new) /* (6-7) */ ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln_router = 1; else { if (is_new) /* (6-7) */ ln_router = 0; } break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_new && (old_addr || new_addr)) || /* (2-5) */ (is_new && new_addr)) { /* (7) */ ln_router = 1; } break; } return (ln_router); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information * */ void nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL, *ln_tmp; int is_newentry; int do_update; int olladdr; int llchange; int flags; uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(from, flags, ifp); IF_AFDATA_RUNLOCK(ifp); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; ln = nd6_alloc(from, 0, ifp); if (ln == NULL) return; /* * Since we already know all the data for the new entry, * fill it before insertion. */ if (lladdr != NULL) { linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over newly-created one */ ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp); if (ln_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); if (lladdr != NULL) /* (7) */ EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } else { lltable_free_entry(LLTABLE6(ifp), ln); ln = ln_tmp; ln_tmp = NULL; } } /* do nothing if static ndp is set */ if ((ln->la_flags & LLE_STATIC)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); return; } olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y y (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ do_update = 0; if (is_newentry == 0 && llchange != 0) { do_update = 1; /* (3,5) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) { /* Entry was deleted */ return; } nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); } /* Calculates new router status */ router = nd6_is_router(type, code, is_newentry, olladdr, lladdr != NULL ? 1 : 0, ln->ln_router); ln->ln_router = router; /* Mark non-router redirects with special flag */ if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER) ln->la_flags |= LLE_REDIRECT; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, ifp, chain, &sin6); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if ((do_update || is_newentry) && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select(); } } static void nd6_slowtimo(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } IFNET_RUNLOCK_NOSLEEP(); CURVNET_RESTORE(); } void nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain, struct sockaddr_in6 *sin6) { LLE_WLOCK_ASSERT(ln); *chain = ln->la_hold; ln->la_hold = NULL; lltable_fill_sa_entry(ln, (struct sockaddr *)sin6); if (ln->ln_state == ND6_LLINFO_STALE) { /* * The first time we send a packet to a * neighbor whose entry is STALE, we have * to change the state to DELAY and a sets * a timer to expire in DELAY_FIRST_PROBE_TIME * seconds to ensure do neighbor unreachability * detection on expiration. * (RFC 2461 7.3.3) */ nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY); } } int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route *ro) { int error; int ip6len; struct ip6_hdr *ip6; struct m_tag *mtag; #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro); return (error); } /* * Lookup link headerfor @sa_dst address. Stores found * data in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * If destination LLE does not exists or lle state modification * is required, call "slow" version. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; if (pflags != NULL) *pflags = 0; dst6 = (const struct sockaddr_in6 *)sa_dst; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (m != NULL && m->m_flags & M_MCAST) { switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_L2VLAN: case IFT_IEEE80211: case IFT_BRIDGE: case IFT_ISO88025: ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr, desten); return (0); default: m_freem(m); return (EAFNOSUPPORT); } } IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, ifp); if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(ln->r_linkdata, desten, ln->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR); /* Check if we have feedback request from nd6 timer */ if (ln->r_skip_req != 0) { LLE_REQ_LOCK(ln); ln->r_skip_req = 0; /* Notify that entry was used */ ln->lle_hittime = time_uptime; LLE_REQ_UNLOCK(ln); } if (plle) { LLE_ADDREF(ln); *plle = ln; LLE_WUNLOCK(ln); } IF_AFDATA_RUNLOCK(ifp); return (0); } else if (plle && ln) LLE_WUNLOCK(ln); IF_AFDATA_RUNLOCK(ifp); return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle)); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Heavy version. * Function assume that destination LLE does not exist, * is invalid or stale, so LLE_EXCLUSIVE lock needs to be acquired. * * Set noinline to be dtrace-friendly */ static __noinline int nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; int send_ns, ll_len; char *lladdr; /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ if (lle == NULL) { IF_AFDATA_RLOCK(ifp); lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); IF_AFDATA_RUNLOCK(ifp); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ lle = nd6_alloc(&dst->sin6_addr, 0, ifp); if (lle == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Prefer any existing entry over newly-created one */ lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if (lle_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { lltable_free_entry(LLTABLE6(ifp), lle); lle = lle_tmp; lle_tmp = NULL; } } } if (lle == NULL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { m_freem(m); return (ENOBUFS); } if (m != NULL) m_freem(m); return (ENOBUFS); } LLE_WLOCK_ASSERT(lle); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY); /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { if (flags & LLE_ADDRONLY) { lladdr = lle->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = lle->r_linkdata; ll_len = lle->r_hdrlen; } bcopy(lladdr, desten, ll_len); if (pflags != NULL) *pflags = lle->la_flags; if (plle) { LLE_ADDREF(lle); *plle = lle; } LLE_WUNLOCK(lle); return (0); } /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * packet queue in the mbuf. When it exceeds nd6_maxqueuelen, * the oldest packet in the queue will be removed. */ if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. * Note that for newly-created lle la_asked will be 0, * so we will transition from ND6_LLINFO_NOSTATE to * ND6_LLINFO_INCOMPLETE state here. */ psrc = NULL; send_ns = 0; if (lle->la_asked == 0) { lle->la_asked++; send_ns = 1; psrc = nd6_llinfo_get_holdsrc(lle, &src); nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE); } LLE_WUNLOCK(lle); if (send_ns != 0) nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL); return (EWOULDBLOCK); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags) { int error; flags |= LLE_ADDRONLY; error = nd6_resolve_slow(ifp, flags, NULL, (const struct sockaddr_in6 *)dst, desten, pflags, NULL); return (error); } int nd6_flush_holdchain(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain, struct sockaddr_in6 *dst) { struct mbuf *m, *m_head; struct ifnet *outifp; int error = 0; m_head = chain; if ((ifp->if_flags & IFF_LOOPBACK) != 0) outifp = origifp; else outifp = ifp; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; error = nd6_output_ifp(ifp, origifp, m, dst, NULL); } /* * XXX * note that intermediate errors are blindly ignored */ return (error); } static int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than ARCnet, Ethernet, FDDI and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_IEEE80211: case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add pernament ND6 link-layer record for given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in different place * 2) It is called by IPv6 protocol stack in contrast to * arp_ifinit() which is typically called in SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln, *ln_tmp; struct sockaddr *dst; ifp = ia->ia_ifa.ifa_ifp; if (nd6_need_cache(ifp) == 0) return (0); ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; dst = (struct sockaddr *)&ia->ia_addr; ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst); if (ln == NULL) return (ENOBUFS); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Unlink any entry if exists */ ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst); if (ln_tmp != NULL) lltable_unlink_entry(LLTABLE6(ifp), ln_tmp); lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); LLE_WUNLOCK(ln); if (ln_tmp != NULL) llentry_free(ln_tmp); return (0); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ void nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all) { struct sockaddr_in6 mask, addr; struct sockaddr *saddr, *smask; struct ifnet *ifp; ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); saddr = (struct sockaddr *)&addr; smask = (struct sockaddr *)&mask; if (all != 0) lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC); else lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; } static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_icmp6); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter", "NDP default router list"); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_prlist, "S,in6_prefix", "NDP prefix list"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { struct in6_defrouter d; struct nd_defrouter *dr; int error; if (req->newptr != NULL) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&d, sizeof(d)); d.rtaddr.sin6_family = AF_INET6; d.rtaddr.sin6_len = sizeof(d.rtaddr); ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { d.rtaddr.sin6_addr = dr->rtaddr; error = sa6_recoverscope(&d.rtaddr); if (error != 0) break; d.flags = dr->raflags; d.rtlifetime = dr->rtlifetime; d.expire = dr->expire + (time_second - time_uptime); d.if_index = dr->ifp->if_index; error = SYSCTL_OUT(req, &d, sizeof(d)); if (error != 0) break; } ND6_RUNLOCK(); return (error); } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); ND6_RLOCK(); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_addrcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) break; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) - break; + goto out; } } +out: ND6_RUNLOCK(); return (error); } Index: user/alc/PQ_LAUNDRY/sys/netinet6/nd6.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet6/nd6.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/netinet6/nd6.h (revision 306832) @@ -1,479 +1,496 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.h,v 1.76 2001/12/18 02:10:31 itojun Exp $ * $FreeBSD$ */ #ifndef _NETINET6_ND6_H_ #define _NETINET6_ND6_H_ /* see net/route.h, or net/if_inarp.h */ #ifndef RTF_ANNOUNCE #define RTF_ANNOUNCE RTF_PROTO2 #endif #include #include struct llentry; #define ND6_LLINFO_NOSTATE -2 /* * We don't need the WAITDELETE state any more, but we keep the definition * in a comment line instead of removing it. This is necessary to avoid * unintentionally reusing the value for another purpose, which might * affect backward compatibility with old applications. * (20000711 jinmei@kame.net) */ /* #define ND6_LLINFO_WAITDELETE -1 */ #define ND6_LLINFO_INCOMPLETE 0 #define ND6_LLINFO_REACHABLE 1 #define ND6_LLINFO_STALE 2 #define ND6_LLINFO_DELAY 3 #define ND6_LLINFO_PROBE 4 #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE) #define ND6_LLINFO_PERMANENT(n) (((n)->la_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE)) struct nd_ifinfo { u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ u_int32_t basereachable; /* BaseReachableTime */ u_int32_t reachable; /* Reachable Time */ u_int32_t retrans; /* Retrans Timer */ u_int32_t flags; /* Flags */ int recalctm; /* BaseReacable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ u_int8_t initialized; /* Flag to see the entry is initialized */ /* the following 3 members are for privacy extension for addrconf */ u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ u_int8_t randomid[8]; /* current random ID */ }; #define ND6_IFF_PERFORMNUD 0x1 #define ND6_IFF_ACCEPT_RTADV 0x2 #define ND6_IFF_PREFER_SOURCE 0x4 /* Not used in FreeBSD. */ #define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to * DAD failure. (XXX: not ND-specific) */ #define ND6_IFF_DONT_SET_IFROUTE 0x10 #define ND6_IFF_AUTO_LINKLOCAL 0x20 #define ND6_IFF_NO_RADR 0x40 #define ND6_IFF_NO_PREFER_IFACE 0x80 /* XXX: not related to ND. */ #define ND6_IFF_NO_DAD 0x100 #ifdef _KERNEL #define ND_IFINFO(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->nd_ifinfo) #define IN6_LINKMTU(ifp) \ ((ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < (ifp)->if_mtu) \ ? ND_IFINFO(ifp)->linkmtu \ : ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < (ifp)->if_mtu) \ ? ND_IFINFO(ifp)->maxmtu : (ifp)->if_mtu)) #endif struct in6_nbrinfo { char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ struct in6_addr addr; /* IPv6 address of the neighbor */ long asked; /* number of queries already sent for this addr */ int isrouter; /* if it acts as a router */ int state; /* reachability state */ int expire; /* lifetime for NDP state transition */ }; #define DRLSTSIZ 10 #define PRLSTSIZ 10 struct in6_drlist { char ifname[IFNAMSIZ]; struct { struct in6_addr rtaddr; u_char flags; u_short rtlifetime; u_long expire; u_short if_index; } defrouter[DRLSTSIZ]; }; struct in6_defrouter { struct sockaddr_in6 rtaddr; u_char flags; u_short rtlifetime; u_long expire; u_short if_index; }; #ifdef _KERNEL struct in6_oprlist { char ifname[IFNAMSIZ]; struct { struct in6_addr prefix; struct prf_ra raflags; u_char prefixlen; u_char origin; u_long vltime; u_long pltime; u_long expire; u_short if_index; u_short advrtrs; /* number of advertisement routers */ struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */ } prefix[PRLSTSIZ]; }; #endif struct in6_prlist { char ifname[IFNAMSIZ]; struct { struct in6_addr prefix; struct prf_ra raflags; u_char prefixlen; u_char origin; u_int32_t vltime; u_int32_t pltime; time_t expire; u_short if_index; u_short advrtrs; /* number of advertisement routers */ struct in6_addr advrtr[DRLSTSIZ]; /* XXX: explicit limit */ } prefix[PRLSTSIZ]; }; struct in6_prefix { struct sockaddr_in6 prefix; struct prf_ra raflags; u_char prefixlen; u_char origin; u_int32_t vltime; u_int32_t pltime; time_t expire; u_int32_t flags; int refcnt; u_short if_index; u_short advrtrs; /* number of advertisement routers */ /* struct sockaddr_in6 advrtr[] */ }; #ifdef _KERNEL struct in6_ondireq { char ifname[IFNAMSIZ]; struct { u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ u_int32_t basereachable; /* BaseReachableTime */ u_int32_t reachable; /* Reachable Time */ u_int32_t retrans; /* Retrans Timer */ u_int32_t flags; /* Flags */ int recalctm; /* BaseReacable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ u_int8_t receivedra; } ndi; }; #endif struct in6_ndireq { char ifname[IFNAMSIZ]; struct nd_ifinfo ndi; }; struct in6_ndifreq { char ifname[IFNAMSIZ]; u_long ifindex; }; /* Prefix status */ #define NDPRF_ONLINK 0x1 #define NDPRF_DETACHED 0x2 /* protocol constants */ #define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */ #define RTR_SOLICITATION_INTERVAL 4 /* 4sec */ #define MAX_RTR_SOLICITATIONS 3 #define ND6_INFINITE_LIFETIME 0xffffffff #ifdef _KERNEL /* node constants */ #define MAX_REACHABLE_TIME 3600000 /* msec */ #define REACHABLE_TIME 30000 /* msec */ #define RETRANS_TIMER 1000 /* msec */ #define MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */ #define MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */ #define DEF_TEMP_VALID_LIFETIME 604800 /* 1 week */ #define DEF_TEMP_PREFERRED_LIFETIME 86400 /* 1 day */ #define TEMPADDR_REGEN_ADVANCE 5 /* sec */ #define MAX_TEMP_DESYNC_FACTOR 600 /* 10 min */ #define ND_COMPUTE_RTIME(x) \ (((MIN_RANDOM_FACTOR * (x >> 10)) + (arc4random() & \ ((MAX_RANDOM_FACTOR - MIN_RANDOM_FACTOR) * (x >> 10)))) /1000) TAILQ_HEAD(nd_drhead, nd_defrouter); struct nd_defrouter { TAILQ_ENTRY(nd_defrouter) dr_entry; struct in6_addr rtaddr; u_char raflags; /* flags on RA message */ u_short rtlifetime; u_long expire; struct ifnet *ifp; int installed; /* is installed into kernel routing table */ u_int refcnt; }; struct nd_prefixctl { struct ifnet *ndpr_ifp; /* prefix */ struct sockaddr_in6 ndpr_prefix; u_char ndpr_plen; u_int32_t ndpr_vltime; /* advertised valid lifetime */ u_int32_t ndpr_pltime; /* advertised preferred lifetime */ struct prf_ra ndpr_flags; }; - +LIST_HEAD(nd_prhead, nd_prefix); struct nd_prefix { struct ifnet *ndpr_ifp; LIST_ENTRY(nd_prefix) ndpr_entry; struct sockaddr_in6 ndpr_prefix; /* prefix */ struct in6_addr ndpr_mask; /* netmask derived from the prefix */ u_int32_t ndpr_vltime; /* advertised valid lifetime */ u_int32_t ndpr_pltime; /* advertised preferred lifetime */ time_t ndpr_expire; /* expiration time of the prefix */ time_t ndpr_preferred; /* preferred time of the prefix */ time_t ndpr_lastupdate; /* reception time of last advertisement */ struct prf_ra ndpr_flags; u_int32_t ndpr_stateflags; /* actual state flags */ /* list of routers that advertise the prefix: */ LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs; u_char ndpr_plen; int ndpr_addrcnt; /* count of derived addresses */ + volatile u_int ndpr_refcnt; }; #define ndpr_raf ndpr_flags #define ndpr_raf_onlink ndpr_flags.onlink #define ndpr_raf_auto ndpr_flags.autonomous #define ndpr_raf_router ndpr_flags.router /* * Message format for use in obtaining information about prefixes * from inet6 sysctl function */ struct inet6_ndpr_msghdr { u_short inpm_msglen; /* to skip over non-understood messages */ u_char inpm_version; /* future binary compatibility */ u_char inpm_type; /* message type */ struct in6_addr inpm_prefix; u_long prm_vltim; u_long prm_pltime; u_long prm_expire; u_long prm_preferred; struct in6_prflags prm_flags; u_short prm_index; /* index for associated ifp */ u_char prm_plen; /* length of prefix in bits */ }; #define prm_raf_onlink prm_flags.prf_ra.onlink #define prm_raf_auto prm_flags.prf_ra.autonomous #define prm_statef_onlink prm_flags.prf_state.onlink #define prm_rrf_decrvalid prm_flags.prf_rr.decrvalid #define prm_rrf_decrprefd prm_flags.prf_rr.decrprefd struct nd_pfxrouter { LIST_ENTRY(nd_pfxrouter) pfr_entry; struct nd_defrouter *router; }; -LIST_HEAD(nd_prhead, nd_prefix); - #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_IP6NDP); #endif /* nd6.c */ VNET_DECLARE(int, nd6_prune); VNET_DECLARE(int, nd6_delay); VNET_DECLARE(int, nd6_umaxtries); VNET_DECLARE(int, nd6_mmaxtries); VNET_DECLARE(int, nd6_useloopback); VNET_DECLARE(int, nd6_maxnudhint); VNET_DECLARE(int, nd6_gctimer); VNET_DECLARE(struct nd_drhead, nd_defrouter); VNET_DECLARE(struct nd_prhead, nd_prefix); VNET_DECLARE(int, nd6_debug); VNET_DECLARE(int, nd6_onlink_ns_rfc4861); #define V_nd6_prune VNET(nd6_prune) #define V_nd6_delay VNET(nd6_delay) #define V_nd6_umaxtries VNET(nd6_umaxtries) #define V_nd6_mmaxtries VNET(nd6_mmaxtries) #define V_nd6_useloopback VNET(nd6_useloopback) #define V_nd6_maxnudhint VNET(nd6_maxnudhint) #define V_nd6_gctimer VNET(nd6_gctimer) #define V_nd_defrouter VNET(nd_defrouter) #define V_nd_prefix VNET(nd_prefix) #define V_nd6_debug VNET(nd6_debug) #define V_nd6_onlink_ns_rfc4861 VNET(nd6_onlink_ns_rfc4861) /* Lock for the prefix and default router lists. */ VNET_DECLARE(struct rwlock, nd6_lock); +VNET_DECLARE(uint64_t, nd6_list_genid); #define V_nd6_lock VNET(nd6_lock) +#define V_nd6_list_genid VNET(nd6_list_genid) #define ND6_RLOCK() rw_rlock(&V_nd6_lock) #define ND6_RUNLOCK() rw_runlock(&V_nd6_lock) #define ND6_WLOCK() rw_wlock(&V_nd6_lock) #define ND6_WUNLOCK() rw_wunlock(&V_nd6_lock) +#define ND6_TRY_UPGRADE() rw_try_upgrade(&V_nd6_lock) #define ND6_WLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_WLOCKED) #define ND6_RLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_RLOCKED) #define ND6_LOCK_ASSERT() rw_assert(&V_nd6_lock, RA_LOCKED) #define ND6_UNLOCK_ASSERT() rw_assert(&V_nd6_lock, RA_UNLOCKED) +/* Mutex for prefix onlink/offlink transitions. */ +VNET_DECLARE(struct mtx, nd6_onlink_mtx); +#define V_nd6_onlink_mtx VNET(nd6_onlink_mtx) + +#define ND6_ONLINK_LOCK() mtx_lock(&V_nd6_onlink_mtx) +#define ND6_ONLINK_TRYLOCK() mtx_trylock(&V_nd6_onlink_mtx) +#define ND6_ONLINK_UNLOCK() mtx_unlock(&V_nd6_onlink_mtx) +#define ND6_ONLINK_LOCK_ASSERT() mtx_assert(&V_nd6_onlink_mtx, MA_OWNED) +#define ND6_ONLINK_UNLOCK_ASSERT() mtx_assert(&V_nd6_onlink_mtx, MA_NOTOWNED) + #define nd6log(x) do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0) /* nd6_rtr.c */ VNET_DECLARE(int, nd6_defifindex); VNET_DECLARE(int, ip6_desync_factor); /* seconds */ VNET_DECLARE(u_int32_t, ip6_temp_preferred_lifetime); /* seconds */ VNET_DECLARE(u_int32_t, ip6_temp_valid_lifetime); /* seconds */ VNET_DECLARE(int, ip6_temp_regen_advance); /* seconds */ #define V_nd6_defifindex VNET(nd6_defifindex) #define V_ip6_desync_factor VNET(ip6_desync_factor) #define V_ip6_temp_preferred_lifetime VNET(ip6_temp_preferred_lifetime) #define V_ip6_temp_valid_lifetime VNET(ip6_temp_valid_lifetime) #define V_ip6_temp_regen_advance VNET(ip6_temp_regen_advance) union nd_opts { struct nd_opt_hdr *nd_opt_array[16]; /* max = ND_OPT_NONCE */ struct { struct nd_opt_hdr *zero; struct nd_opt_hdr *src_lladdr; struct nd_opt_hdr *tgt_lladdr; struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */ struct nd_opt_rd_hdr *rh; struct nd_opt_mtu *mtu; struct nd_opt_hdr *__res6; struct nd_opt_hdr *__res7; struct nd_opt_hdr *__res8; struct nd_opt_hdr *__res9; struct nd_opt_hdr *__res10; struct nd_opt_hdr *__res11; struct nd_opt_hdr *__res12; struct nd_opt_hdr *__res13; struct nd_opt_nonce *nonce; struct nd_opt_hdr *__res15; struct nd_opt_hdr *search; /* multiple opts */ struct nd_opt_hdr *last; /* multiple opts */ int done; struct nd_opt_prefix_info *pi_end;/* multiple opts, end */ } nd_opt_each; }; #define nd_opts_src_lladdr nd_opt_each.src_lladdr #define nd_opts_tgt_lladdr nd_opt_each.tgt_lladdr #define nd_opts_pi nd_opt_each.pi_beg #define nd_opts_pi_end nd_opt_each.pi_end #define nd_opts_rh nd_opt_each.rh #define nd_opts_mtu nd_opt_each.mtu #define nd_opts_nonce nd_opt_each.nonce #define nd_opts_search nd_opt_each.search #define nd_opts_last nd_opt_each.last #define nd_opts_done nd_opt_each.done /* XXX: need nd6_var.h?? */ /* nd6.c */ void nd6_init(void); #ifdef VIMAGE void nd6_destroy(void); #endif struct nd_ifinfo *nd6_ifattach(struct ifnet *); void nd6_ifdetach(struct ifnet *, struct nd_ifinfo *); int nd6_is_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); void nd6_option_init(void *, int, union nd_opts *); struct nd_opt_hdr *nd6_option(union nd_opts *); int nd6_options(union nd_opts *); struct llentry *nd6_lookup(const struct in6_addr *, int, struct ifnet *); struct llentry *nd6_alloc(const struct in6_addr *, int, struct ifnet *); void nd6_setmtu(struct ifnet *); void nd6_llinfo_setstate(struct llentry *lle, int newstate); void nd6_timer(void *); void nd6_purge(struct ifnet *); int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags); int nd6_resolve(struct ifnet *, int, struct mbuf *, const struct sockaddr *, u_char *, uint32_t *, struct llentry **); int nd6_ioctl(u_long, caddr_t, struct ifnet *); void nd6_cache_lladdr(struct ifnet *, struct in6_addr *, char *, int, int, int); void nd6_grab_holdchain(struct llentry *, struct mbuf **, struct sockaddr_in6 *); int nd6_flush_holdchain(struct ifnet *, struct ifnet *, struct mbuf *, struct sockaddr_in6 *); int nd6_add_ifa_lle(struct in6_ifaddr *); void nd6_rem_ifa_lle(struct in6_ifaddr *, int); int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *, struct sockaddr_in6 *, struct route *); /* nd6_nbr.c */ void nd6_na_input(struct mbuf *, int, int); void nd6_na_output(struct ifnet *, const struct in6_addr *, const struct in6_addr *, u_long, int, struct sockaddr *); void nd6_ns_input(struct mbuf *, int, int); void nd6_ns_output(struct ifnet *, const struct in6_addr *, const struct in6_addr *, const struct in6_addr *, uint8_t *); caddr_t nd6_ifptomac(struct ifnet *); void nd6_dad_init(void); void nd6_dad_start(struct ifaddr *, int); void nd6_dad_stop(struct ifaddr *); /* nd6_rtr.c */ void nd6_rs_input(struct mbuf *, int, int); void nd6_ra_input(struct mbuf *, int, int); void defrouter_reset(void); void defrouter_select(void); void defrouter_ref(struct nd_defrouter *); void defrouter_rele(struct nd_defrouter *); bool defrouter_remove(struct in6_addr *, struct ifnet *); void defrouter_unlink(struct nd_defrouter *, struct nd_drhead *); void defrouter_del(struct nd_defrouter *); -void prelist_remove(struct nd_prefix *); int nd6_prelist_add(struct nd_prefixctl *, struct nd_defrouter *, - struct nd_prefix **); + struct nd_prefix **); +void nd6_prefix_unlink(struct nd_prefix *, struct nd_prhead *); +void nd6_prefix_del(struct nd_prefix *); +void nd6_prefix_ref(struct nd_prefix *); +void nd6_prefix_rele(struct nd_prefix *); +int nd6_prefix_onlink(struct nd_prefix *); +int nd6_prefix_offlink(struct nd_prefix *); void pfxlist_onlink_check(void); struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *); struct nd_defrouter *defrouter_lookup_locked(struct in6_addr *, struct ifnet *); struct nd_prefix *nd6_prefix_lookup(struct nd_prefixctl *); void rt6_flush(struct in6_addr *, struct ifnet *); int nd6_setdefaultiface(int); int in6_tmpifadd(const struct in6_ifaddr *, int, int); #endif /* _KERNEL */ #endif /* _NETINET6_ND6_H_ */ Index: user/alc/PQ_LAUNDRY/sys/netinet6/nd6_rtr.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet6/nd6_rtr.c (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/netinet6/nd6_rtr.c (revision 306832) @@ -1,2243 +1,2343 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int rtpref(struct nd_defrouter *); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *, struct mbuf *, int); static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int); static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, - struct nd_defrouter *); + struct nd_defrouter *); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_del(struct nd_pfxrouter *); -static struct nd_pfxrouter *find_pfxlist_reachable_router -(struct nd_prefix *); +static struct nd_pfxrouter *find_pfxlist_reachable_router(struct nd_prefix *); static void defrouter_delreq(struct nd_defrouter *); static void nd6_rtmsg(int, struct rtentry *); static int in6_init_prefix_ltimes(struct nd_prefix *); static void in6_init_address_ltimes(struct nd_prefix *, - struct in6_addrlifetime *); + struct in6_addrlifetime *); -static int nd6_prefix_onlink(struct nd_prefix *); -static int nd6_prefix_offlink(struct nd_prefix *); - static int rt6_deleteroute(const struct rtentry *, void *); VNET_DECLARE(int, nd6_recalc_reachtm_interval); #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) static VNET_DEFINE(struct ifnet *, nd6_defifp); VNET_DEFINE(int, nd6_defifindex); #define V_nd6_defifp VNET(nd6_defifp) VNET_DEFINE(int, ip6_use_tempaddr) = 0; VNET_DEFINE(int, ip6_desync_factor); VNET_DEFINE(u_int32_t, ip6_temp_preferred_lifetime) = DEF_TEMP_PREFERRED_LIFETIME; VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME; VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE; /* RTPREF_MEDIUM has to be 0! */ #define RTPREF_HIGH 1 #define RTPREF_MEDIUM 0 #define RTPREF_LOW (-1) #define RTPREF_RESERVED (-2) #define RTPREF_INVALID (-3) /* internal */ /* * Receive Router Solicitation Message - just for routers. * Router solicitation/advertisement is mostly managed by userland program * (rtadvd) so here we have no function like nd6_ra_output(). * * Based on RFC 2461 */ void nd6_rs_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; struct in6_addr saddr6 = ip6->ip6_src; char *lladdr = NULL; int lladdrlen = 0; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* * Accept RS only when V_ip6_forwarding=1 and the interface has * no ND6_IFF_ACCEPT_RTADV. */ if (!V_ip6_forwarding || ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) goto freeit; /* Sanity checks */ if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } /* * Don't update the neighbor cache, if src = ::. * This indicates that the src has no IP address assigned yet. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); if (nd_rs == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif icmp6len -= sizeof(*nd_rs); nd6_option_init(nd_rs + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_rs_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_rs_input: lladdrlen mismatch for %s " "(if %d, RS packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0); freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badrs); m_freem(m); } /* * Receive Router Advertisement Message. * * Based on RFC 2461 * TODO: on-link bit on prefix information * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing */ void nd6_ra_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct nd_ifinfo *ndi = ND_IFINFO(ifp); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_advert *nd_ra; struct in6_addr saddr6 = ip6->ip6_src; int mcast = 0; union nd_opts ndopts; struct nd_defrouter *dr; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; dr = NULL; /* * We only accept RAs only when the per-interface flag * ND6_IFF_ACCEPT_RTADV is on the receiving interface. */ if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { nd6log((LOG_ERR, "nd6_ra_input: src %s is not link-local\n", ip6_sprintf(ip6bufs, &saddr6))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); if (nd_ra == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif icmp6len -= sizeof(*nd_ra); nd6_option_init(nd_ra + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ra_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } { struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; /* remember if this is a multicasted advertisement */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) mcast = 1; bzero(&dr0, sizeof(dr0)); dr0.rtaddr = saddr6; dr0.raflags = nd_ra->nd_ra_flags_reserved; /* * Effectively-disable routes from RA messages when * ND6_IFF_NO_RADR enabled on the receiving interface or * (ip6.forwarding == 1 && ip6.rfc6204w3 != 1). */ if (ndi->flags & ND6_IFF_NO_RADR) dr0.rtlifetime = 0; else if (V_ip6_forwarding && !V_ip6_rfc6204w3) dr0.rtlifetime = 0; else dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = time_uptime + dr0.rtlifetime; dr0.ifp = ifp; /* unspecified or not? (RFC 2461 6.3.4) */ if (advreachable) { advreachable = ntohl(advreachable); if (advreachable <= MAX_REACHABLE_TIME && ndi->basereachable != advreachable) { ndi->basereachable = advreachable; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */ } } if (nd_ra->nd_ra_retransmit) ndi->retrans = ntohl(nd_ra->nd_ra_retransmit); if (nd_ra->nd_ra_curhoplimit) { if (ndi->chlim < nd_ra->nd_ra_curhoplimit) ndi->chlim = nd_ra->nd_ra_curhoplimit; else if (ndi->chlim != nd_ra->nd_ra_curhoplimit) { log(LOG_ERR, "RA with a lower CurHopLimit sent from " "%s on %s (current = %d, received = %d). " "Ignored.\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), if_name(ifp), ndi->chlim, nd_ra->nd_ra_curhoplimit); } } dr = defrtrlist_update(&dr0); } /* * prefix */ if (ndopts.nd_opts_pi) { struct nd_opt_hdr *pt; struct nd_opt_prefix_info *pi = NULL; struct nd_prefixctl pr; for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi; pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end; pt = (struct nd_opt_hdr *)((caddr_t)pt + (pt->nd_opt_len << 3))) { if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION) continue; pi = (struct nd_opt_prefix_info *)pt; if (pi->nd_opt_pi_len != 4) { nd6log((LOG_INFO, "nd6_ra_input: invalid option " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_len)); continue; } if (128 < pi->nd_opt_pi_prefix_len) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_prefix_len)); continue; } if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "%s, ignored\n", ip6_sprintf(ip6bufs, &pi->nd_opt_pi_prefix))); continue; } bzero(&pr, sizeof(pr)); pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif; pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_ONLINK) ? 1 : 0; pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_AUTO) ? 1 : 0; pr.ndpr_plen = pi->nd_opt_pi_prefix_len; pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time); pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time); (void)prelist_update(&pr, dr, m, mcast); } } if (dr != NULL) { defrouter_rele(dr); dr = NULL; } /* * MTU */ if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) { u_long mtu; u_long maxmtu; mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu); /* lower bound */ if (mtu < IPV6_MMTU) { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option " "mtu=%lu sent from %s, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src))); goto skip; } /* upper bound */ maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu) ? ndi->maxmtu : ifp->if_mtu; if (mtu <= maxmtu) { int change = (ndi->linkmtu != mtu); ndi->linkmtu = mtu; if (change) /* in6_maxmtu may change */ in6_setmaxmtu(); } else { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu " "mtu=%lu sent from %s; " "exceeds maxmtu %lu, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu)); } } skip: /* * Source link layer address */ { char *lladdr = NULL; int lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ra_input: lladdrlen mismatch for %s " "(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_ADVERT, 0); /* * Installing a link-layer address might change the state of the * router's neighbor cache, which might also affect our on-link * detection of adveritsed prefixes. */ pfxlist_onlink_check(); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badra); m_freem(m); } /* tell the change to user processes watching the routing socket. */ static void nd6_rtmsg(int cmd, struct rtentry *rt) { struct rt_addrinfo info; struct ifnet *ifp; struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifp = rt->rt_ifp; if (ifp != NULL) { IF_ADDR_RLOCK(ifp); ifa = TAILQ_FIRST(&ifp->if_addrhead); info.rti_info[RTAX_IFP] = ifa->ifa_addr; ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } else ifa = NULL; rt_missmsg_fib(cmd, &info, rt->rt_flags, 0, rt->rt_fibnum); if (ifa != NULL) ifa_free(ifa); } /* * default router list processing sub routines */ static void defrouter_addreq(struct nd_defrouter *new) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; int error; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt, RT_DEFAULT_FIB); if (newrt) { nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RTFREE(newrt); } if (error == 0) new->installed = 1; } struct nd_defrouter * defrouter_lookup_locked(struct in6_addr *addr, struct ifnet *ifp) { struct nd_defrouter *dr; ND6_LOCK_ASSERT(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) { defrouter_ref(dr); return (dr); } return (NULL); } struct nd_defrouter * defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) { struct nd_defrouter *dr; ND6_RLOCK(); dr = defrouter_lookup_locked(addr, ifp); ND6_RUNLOCK(); return (dr); } void defrouter_ref(struct nd_defrouter *dr) { refcount_acquire(&dr->refcnt); } void defrouter_rele(struct nd_defrouter *dr) { if (refcount_release(&dr->refcnt)) free(dr, M_IP6NDP); } /* * Remove the default route for a given router. * This is just a subroutine function for defrouter_select(), and should * not be called from anywhere else. */ static void defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, RT_DEFAULT_FIB); if (oldrt) { nd6_rtmsg(RTM_DELETE, oldrt); RTFREE(oldrt); } dr->installed = 0; } /* * Remove all default routes from default router list. */ void defrouter_reset(void) { struct nd_defrouter *dr, **dra; int count, i; count = i = 0; /* * We can't delete routes with the ND lock held, so make a copy of the * current default router list and use that when deleting routes. */ ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) count++; ND6_RUNLOCK(); dra = malloc(count * sizeof(*dra), M_TEMP, M_WAITOK | M_ZERO); ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { if (i == count) break; defrouter_ref(dr); dra[i++] = dr; } ND6_RUNLOCK(); for (i = 0; i < count && dra[i] != NULL; i++) { defrouter_delreq(dra[i]); defrouter_rele(dra[i]); } free(dra, M_TEMP); /* * XXX should we also nuke any default routers in the kernel, by * going through them by rtalloc1()? */ } /* * Look up a matching default router list entry and remove it. Returns true if a * matching entry was found, false otherwise. */ bool defrouter_remove(struct in6_addr *addr, struct ifnet *ifp) { struct nd_defrouter *dr; ND6_WLOCK(); dr = defrouter_lookup_locked(addr, ifp); if (dr == NULL) { ND6_WUNLOCK(); return (false); } defrouter_unlink(dr, NULL); ND6_WUNLOCK(); defrouter_del(dr); defrouter_rele(dr); return (true); } /* * Remove a router from the global list and optionally stash it in a * caller-supplied queue. * * The ND lock must be held. */ void defrouter_unlink(struct nd_defrouter *dr, struct nd_drhead *drq) { ND6_WLOCK_ASSERT(); TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); + V_nd6_list_genid++; if (drq != NULL) TAILQ_INSERT_TAIL(drq, dr, dr_entry); } void defrouter_del(struct nd_defrouter *dr) { struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; + struct nd_pfxrouter *pfxrtr; ND6_UNLOCK_ASSERT(); /* * Flush all the routing table entries that use the router * as a next hop. */ if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV) rt6_flush(&dr->rtaddr, dr->ifp); if (dr->installed) { deldr = dr; defrouter_delreq(dr); } /* * Also delete all the pointers to the router in each prefix lists. */ + ND6_WLOCK(); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { - struct nd_pfxrouter *pfxrtr; if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); } + ND6_WUNLOCK(); + pfxlist_onlink_check(); /* * If the router is the primary one, choose a new one. * Note that defrouter_select() will remove the current gateway * from the routing table. */ if (deldr) defrouter_select(); /* * Release the list reference. */ defrouter_rele(dr); } /* * Default Router Selection according to Section 6.3.6 of RFC 2461 and * draft-ietf-ipngwg-router-selection: * 1) Routers that are reachable or probably reachable should be preferred. * If we have more than one (probably) reachable router, prefer ones * with the highest router preference. * 2) When no routers on the list are known to be reachable or * probably reachable, routers SHOULD be selected in a round-robin * fashion, regardless of router preference values. * 3) If the Default Router List is empty, assume that all * destinations are on-link. * * We assume nd_defrouter is sorted by router preference value. * Since the code below covers both with and without router preference cases, * we do not need to classify the cases by ifdef. * * At this moment, we do not try to install more than one default router, * even when the multipath routing is available, because we're not sure about * the benefits for stub hosts comparing to the risk of making the code * complicated and the possibility of introducing bugs. */ void defrouter_select(void) { struct nd_defrouter *dr, *selected_dr, *installed_dr; struct llentry *ln = NULL; ND6_RLOCK(); /* * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ if (TAILQ_EMPTY(&V_nd_defrouter)) { ND6_RUNLOCK(); return; } /* * Search for a (probably) reachable router from the list. * We just pick up the first reachable one (if any), assuming that * the ordering rule of the list described in defrtrlist_update(). */ selected_dr = installed_dr = NULL; TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { IF_AFDATA_RLOCK(dr->ifp); if (selected_dr == NULL && (ln = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln)) { selected_dr = dr; defrouter_ref(selected_dr); } IF_AFDATA_RUNLOCK(dr->ifp); if (ln != NULL) { LLE_RUNLOCK(ln); ln = NULL; } if (dr->installed) { if (installed_dr == NULL) { installed_dr = dr; defrouter_ref(installed_dr); } else { /* this should not happen. warn for diagnosis. */ log(LOG_ERR, "defrouter_select: more than one router is installed\n"); } } } /* * If none of the default routers was found to be reachable, * round-robin the list regardless of preference. * Otherwise, if we have an installed router, check if the selected * (reachable) router should really be preferred to the installed one. * We only prefer the new router when the old one is not reachable * or when the new one has a really higher preference value. */ if (selected_dr == NULL) { if (installed_dr == NULL || TAILQ_NEXT(installed_dr, dr_entry) == NULL) selected_dr = TAILQ_FIRST(&V_nd_defrouter); else selected_dr = TAILQ_NEXT(installed_dr, dr_entry); defrouter_ref(selected_dr); } else if (installed_dr != NULL) { IF_AFDATA_RLOCK(installed_dr->ifp); if ((ln = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln) && rtpref(selected_dr) <= rtpref(installed_dr)) { defrouter_rele(selected_dr); selected_dr = installed_dr; } IF_AFDATA_RUNLOCK(installed_dr->ifp); if (ln != NULL) LLE_RUNLOCK(ln); } ND6_RUNLOCK(); /* * If the selected router is different than the installed one, * remove the installed router and install the selected one. * Note that the selected router is never NULL here. */ if (installed_dr != selected_dr) { if (installed_dr != NULL) { defrouter_delreq(installed_dr); defrouter_rele(installed_dr); } defrouter_addreq(selected_dr); } defrouter_rele(selected_dr); } /* * for default router selection * regards router-preference field as a 2-bit signed integer */ static int rtpref(struct nd_defrouter *dr) { switch (dr->raflags & ND_RA_FLAG_RTPREF_MASK) { case ND_RA_FLAG_RTPREF_HIGH: return (RTPREF_HIGH); case ND_RA_FLAG_RTPREF_MEDIUM: case ND_RA_FLAG_RTPREF_RSV: return (RTPREF_MEDIUM); case ND_RA_FLAG_RTPREF_LOW: return (RTPREF_LOW); default: /* * This case should never happen. If it did, it would mean a * serious bug of kernel internal. We thus always bark here. * Or, can we even panic? */ log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->raflags); return (RTPREF_INVALID); } /* NOTREACHED */ } static struct nd_defrouter * defrtrlist_update(struct nd_defrouter *new) { struct nd_defrouter *dr, *n; + uint64_t genid; int oldpref; + bool writelocked; if (new->rtlifetime == 0) { defrouter_remove(&new->rtaddr, new->ifp); return (NULL); } - ND6_WLOCK(); + ND6_RLOCK(); + writelocked = false; +restart: dr = defrouter_lookup_locked(&new->rtaddr, new->ifp); if (dr != NULL) { oldpref = rtpref(dr); /* override */ dr->raflags = new->raflags; /* XXX flag check */ dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; /* * If the preference does not change, there's no need * to sort the entries. Also make sure the selected * router is still installed in the kernel. */ if (dr->installed && rtpref(new) == oldpref) { - ND6_WUNLOCK(); + if (writelocked) + ND6_WUNLOCK(); + else + ND6_RUNLOCK(); return (dr); } + } + /* + * The router needs to be reinserted into the default router + * list, so upgrade to a write lock. If that fails and the list + * has potentially changed while the lock was dropped, we'll + * redo the lookup with the write lock held. + */ + if (!writelocked) { + writelocked = true; + if (!ND6_TRY_UPGRADE()) { + genid = V_nd6_list_genid; + ND6_RUNLOCK(); + ND6_WLOCK(); + if (genid != V_nd6_list_genid) + goto restart; + } + } + + if (dr != NULL) { /* * The preferred router may have changed, so relocate this * router. */ TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); n = dr; } else { n = malloc(sizeof(*n), M_IP6NDP, M_NOWAIT | M_ZERO); if (n == NULL) { ND6_WUNLOCK(); return (NULL); } memcpy(n, new, sizeof(*n)); /* Initialize with an extra reference for the caller. */ refcount_init(&n->refcnt, 2); } /* * Insert the new router in the Default Router List; * The Default Router List should be in the descending order * of router-preferece. Routers with the same preference are * sorted in the arriving time order. */ /* insert at the end of the group */ TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { if (rtpref(n) > rtpref(dr)) break; } if (dr != NULL) TAILQ_INSERT_BEFORE(dr, n, dr_entry); else TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry); + V_nd6_list_genid++; ND6_WUNLOCK(); defrouter_select(); return (n); } static struct nd_pfxrouter * pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *search; + ND6_LOCK_ASSERT(); + LIST_FOREACH(search, &pr->ndpr_advrtrs, pfr_entry) { if (search->router == dr) break; } - return (search); } static void pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *new; + bool update; + ND6_UNLOCK_ASSERT(); + + ND6_RLOCK(); + if (pfxrtr_lookup(pr, dr) != NULL) { + ND6_RUNLOCK(); + return; + } + ND6_RUNLOCK(); + new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO); if (new == NULL) return; - new->router = dr; defrouter_ref(dr); + new->router = dr; - LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); + ND6_WLOCK(); + if (pfxrtr_lookup(pr, dr) == NULL) { + LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); + update = true; + } else { + /* We lost a race to add the reference. */ + defrouter_rele(dr); + free(new, M_IP6NDP); + update = false; + } + ND6_WUNLOCK(); - pfxlist_onlink_check(); + if (update) + pfxlist_onlink_check(); } static void pfxrtr_del(struct nd_pfxrouter *pfr) { + ND6_WLOCK_ASSERT(); + LIST_REMOVE(pfr, pfr_entry); defrouter_rele(pfr->router); free(pfr, M_IP6NDP); } -struct nd_prefix * -nd6_prefix_lookup(struct nd_prefixctl *key) +static struct nd_prefix * +nd6_prefix_lookup_locked(struct nd_prefixctl *key) { struct nd_prefix *search; + ND6_LOCK_ASSERT(); + LIST_FOREACH(search, &V_nd_prefix, ndpr_entry) { if (key->ndpr_ifp == search->ndpr_ifp && key->ndpr_plen == search->ndpr_plen && in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr, &search->ndpr_prefix.sin6_addr, key->ndpr_plen)) { + nd6_prefix_ref(search); break; } } + return (search); +} +struct nd_prefix * +nd6_prefix_lookup(struct nd_prefixctl *key) +{ + struct nd_prefix *search; + + ND6_RLOCK(); + search = nd6_prefix_lookup_locked(key); + ND6_RUNLOCK(); return (search); } +void +nd6_prefix_ref(struct nd_prefix *pr) +{ + + refcount_acquire(&pr->ndpr_refcnt); +} + +void +nd6_prefix_rele(struct nd_prefix *pr) +{ + + if (refcount_release(&pr->ndpr_refcnt)) { + KASSERT(LIST_EMPTY(&pr->ndpr_advrtrs), + ("prefix %p has advertising routers", pr)); + free(pr, M_IP6NDP); + } +} + int nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr, struct nd_prefix **newp) { - struct nd_prefix *new = NULL; - int error = 0; + struct nd_prefix *new; char ip6buf[INET6_ADDRSTRLEN]; + int error; new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO); if (new == NULL) return (ENOMEM); + refcount_init(&new->ndpr_refcnt, newp != NULL ? 2 : 1); new->ndpr_ifp = pr->ndpr_ifp; new->ndpr_prefix = pr->ndpr_prefix; new->ndpr_plen = pr->ndpr_plen; new->ndpr_vltime = pr->ndpr_vltime; new->ndpr_pltime = pr->ndpr_pltime; new->ndpr_flags = pr->ndpr_flags; if ((error = in6_init_prefix_ltimes(new)) != 0) { free(new, M_IP6NDP); return (error); } new->ndpr_lastupdate = time_uptime; /* initialization */ LIST_INIT(&new->ndpr_advrtrs); in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen); /* make prefix in the canonical form */ IN6_MASK_ADDR(&new->ndpr_prefix.sin6_addr, &new->ndpr_mask); - /* link ndpr_entry to nd_prefix list */ + ND6_WLOCK(); LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry); + V_nd6_list_genid++; + ND6_WUNLOCK(); /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { - int e; - - if ((e = nd6_prefix_onlink(new)) != 0) { + ND6_ONLINK_LOCK(); + if ((error = nd6_prefix_onlink(new)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link on %s (errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); + pr->ndpr_plen, if_name(pr->ndpr_ifp), error)); /* proceed anyway. XXX: is it correct? */ } + ND6_ONLINK_UNLOCK(); } if (dr != NULL) pfxrtr_add(new, dr); if (newp != NULL) *newp = new; return (0); } +/* + * Remove a prefix from the prefix list and optionally stash it in a + * caller-provided list. + * + * The ND6 lock must be held. + */ void -prelist_remove(struct nd_prefix *pr) +nd6_prefix_unlink(struct nd_prefix *pr, struct nd_prhead *list) { + + KASSERT(pr->ndpr_addrcnt == 0, + ("prefix %p has referencing addresses", pr)); + ND6_WLOCK_ASSERT(); + + LIST_REMOVE(pr, ndpr_entry); + V_nd6_list_genid++; + if (list != NULL) + LIST_INSERT_HEAD(list, pr, ndpr_entry); +} + +/* + * Free an unlinked prefix, first marking it off-link if necessary. + */ +void +nd6_prefix_del(struct nd_prefix *pr) +{ struct nd_pfxrouter *pfr, *next; int e; char ip6buf[INET6_ADDRSTRLEN]; - /* make sure to invalidate the prefix until it is really freed. */ - pr->ndpr_vltime = 0; - pr->ndpr_pltime = 0; + KASSERT(pr->ndpr_addrcnt == 0, + ("prefix %p has referencing addresses", pr)); + ND6_UNLOCK_ASSERT(); /* * Though these flags are now meaningless, we'd rather keep the value * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users * when executing "ndp -p". */ - - if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 && - (e = nd6_prefix_offlink(pr)) != 0) { - nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink " - "on %s, errno=%d\n", - ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); - /* what should we do? */ + if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { + ND6_ONLINK_LOCK(); + if ((e = nd6_prefix_offlink(pr)) != 0) { + nd6log((LOG_ERR, + "nd6_prefix_del: failed to make %s/%d offlink " + "on %s, errno=%d\n", + ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); + /* what should we do? */ + } + ND6_ONLINK_UNLOCK(); } - if (pr->ndpr_addrcnt > 0) - return; /* notice here? */ - - /* unlink ndpr_entry from nd_prefix list */ - LIST_REMOVE(pr, ndpr_entry); - - /* free list of routers that advertised the prefix */ - LIST_FOREACH_SAFE(pfr, &pr->ndpr_advrtrs, pfr_entry, next) { + /* Release references to routers that have advertised this prefix. */ + ND6_WLOCK(); + LIST_FOREACH_SAFE(pfr, &pr->ndpr_advrtrs, pfr_entry, next) pfxrtr_del(pfr); - } - free(pr, M_IP6NDP); + ND6_WUNLOCK(); + nd6_prefix_rele(pr); + pfxlist_onlink_check(); } -/* - * dr - may be NULL - */ - static int prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, struct mbuf *m, int mcast) { struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; struct ifnet *ifp = new->ndpr_ifp; struct nd_prefix *pr; int error = 0; int auth; struct in6_addrlifetime lt6_tmp; char ip6buf[INET6_ADDRSTRLEN]; auth = 0; if (m) { /* * Authenticity for NA consists authentication for * both IP header and IP datagrams, doesn't it ? */ #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM) auth = ((m->m_flags & M_AUTHIPHDR) && (m->m_flags & M_AUTHIPDGM)); #endif } if ((pr = nd6_prefix_lookup(new)) != NULL) { /* * nd6_prefix_lookup() ensures that pr and new have the same * prefix on a same interface. */ /* * Update prefix information. Note that the on-link (L) bit * and the autonomous (A) bit should NOT be changed from 1 * to 0. */ if (new->ndpr_raf_onlink == 1) pr->ndpr_raf_onlink = 1; if (new->ndpr_raf_auto == 1) pr->ndpr_raf_auto = 1; if (new->ndpr_raf_onlink) { pr->ndpr_vltime = new->ndpr_vltime; pr->ndpr_pltime = new->ndpr_pltime; (void)in6_init_prefix_ltimes(pr); /* XXX error case? */ pr->ndpr_lastupdate = time_uptime; } if (new->ndpr_raf_onlink && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { - int e; - - if ((e = nd6_prefix_onlink(pr)) != 0) { + ND6_ONLINK_LOCK(); + if ((error = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " "(errno=%d)\n", ip6_sprintf(ip6buf, - &pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); + &pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), + error)); /* proceed anyway. XXX: is it correct? */ } + ND6_ONLINK_UNLOCK(); } - if (dr && pfxrtr_lookup(pr, dr) == NULL) + if (dr != NULL) pfxrtr_add(pr, dr); } else { if (new->ndpr_vltime == 0) goto end; if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0) goto end; error = nd6_prelist_add(new, dr, &pr); if (error != 0) { nd6log((LOG_NOTICE, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s errno=%d\n", ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr), new->ndpr_plen, if_name(new->ndpr_ifp), error)); goto end; /* we should just give up in this case. */ } /* * XXX: from the ND point of view, we can ignore a prefix * with the on-link bit being zero. However, we need a * prefix structure for references from autoconfigured * addresses. Thus, we explicitly make sure that the prefix * itself expires now. */ if (pr->ndpr_raf_onlink == 0) { pr->ndpr_vltime = 0; pr->ndpr_pltime = 0; in6_init_prefix_ltimes(pr); } } /* * Address autoconfiguration based on Section 5.5.3 of RFC 2462. * Note that pr must be non NULL at this point. */ /* 5.5.3 (a). Ignore the prefix without the A bit set. */ if (!new->ndpr_raf_auto) goto end; /* * 5.5.3 (b). the link-local prefix should have been ignored in * nd6_ra_input. */ /* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */ if (new->ndpr_pltime > new->ndpr_vltime) { error = EINVAL; /* XXX: won't be used */ goto end; } /* * 5.5.3 (d). If the prefix advertised is not equal to the prefix of * an address configured by stateless autoconfiguration already in the * list of addresses associated with the interface, and the Valid * Lifetime is not 0, form an address. We first check if we have * a matching prefix. * Note: we apply a clarification in rfc2462bis-02 here. We only * consider autoconfigured addresses while RFC2462 simply said * "address". */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *ifa6; u_int32_t remaininglifetime; if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; /* * We only consider autoconfigured addresses as per rfc2462bis. */ if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF)) continue; /* * Spec is not clear here, but I believe we should concentrate * on unicast (i.e. not anycast) addresses. * XXX: other ia6_flags? detached or duplicated? */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) continue; /* * Ignore the address if it is not associated with a prefix * or is associated with a prefix that is different from this * one. (pr is never NULL here) */ if (ifa6->ia6_ndpr != pr) continue; if (ia6_match == NULL) /* remember the first one */ ia6_match = ifa6; /* * An already autoconfigured address matched. Now that we * are sure there is at least one matched address, we can * proceed to 5.5.3. (e): update the lifetimes according to the * "two hours" rule and the privacy extension. * We apply some clarifications in rfc2462bis: * - use remaininglifetime instead of storedlifetime as a * variable name * - remove the dead code in the "two-hour" rule */ #define TWOHOUR (120*60) lt6_tmp = ifa6->ia6_lifetime; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME) remaininglifetime = ND6_INFINITE_LIFETIME; else if (time_uptime - ifa6->ia6_updatetime > lt6_tmp.ia6t_vltime) { /* * The case of "invalid" address. We should usually * not see this case. */ remaininglifetime = 0; } else remaininglifetime = lt6_tmp.ia6t_vltime - (time_uptime - ifa6->ia6_updatetime); /* when not updating, keep the current stored lifetime. */ lt6_tmp.ia6t_vltime = remaininglifetime; if (TWOHOUR < new->ndpr_vltime || remaininglifetime < new->ndpr_vltime) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } else if (remaininglifetime <= TWOHOUR) { if (auth) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } } else { /* * new->ndpr_vltime <= TWOHOUR && * TWOHOUR < remaininglifetime */ lt6_tmp.ia6t_vltime = TWOHOUR; } /* The 2 hour rule is not imposed for preferred lifetime. */ lt6_tmp.ia6t_pltime = new->ndpr_pltime; in6_init_address_ltimes(pr, <6_tmp); /* * We need to treat lifetimes for temporary addresses * differently, according to * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1); * we only update the lifetimes when they are in the maximum * intervals. */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { u_int32_t maxvltime, maxpltime; if (V_ip6_temp_valid_lifetime > (u_int32_t)((time_uptime - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxvltime = V_ip6_temp_valid_lifetime - (time_uptime - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxvltime = 0; if (V_ip6_temp_preferred_lifetime > (u_int32_t)((time_uptime - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxpltime = V_ip6_temp_preferred_lifetime - (time_uptime - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxpltime = 0; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_vltime > maxvltime) { lt6_tmp.ia6t_vltime = maxvltime; } if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_pltime > maxpltime) { lt6_tmp.ia6t_pltime = maxpltime; } } ifa6->ia6_lifetime = lt6_tmp; ifa6->ia6_updatetime = time_uptime; } IF_ADDR_RUNLOCK(ifp); if (ia6_match == NULL && new->ndpr_vltime) { int ifidlen; /* * 5.5.3 (d) (continued) * No address matched and the valid lifetime is non-zero. * Create a new address. */ /* * Prefix Length check: * If the sum of the prefix length and interface identifier * length does not equal 128 bits, the Prefix Information * option MUST be ignored. The length of the interface * identifier is defined in a separate link-type specific * document. */ ifidlen = in6_if2idlen(ifp); if (ifidlen < 0) { /* this should not happen, so we always log it. */ log(LOG_ERR, "prelist_update: IFID undefined (%s)\n", if_name(ifp)); goto end; } if (ifidlen + pr->ndpr_plen != 128) { nd6log((LOG_INFO, "prelist_update: invalid prefixlen " "%d for %s, ignored\n", pr->ndpr_plen, if_name(ifp))); goto end; } if ((ia6 = in6_ifadd(new, mcast)) != NULL) { /* * note that we should use pr (not new) for reference. */ pr->ndpr_addrcnt++; ia6->ia6_ndpr = pr; /* * RFC 3041 3.3 (2). * When a new public address is created as described * in RFC2462, also create a new temporary address. * * RFC 3041 3.5. * When an interface connects to a new link, a new * randomized interface identifier should be generated * immediately together with a new set of temporary * addresses. Thus, we specifiy 1 as the 2nd arg of * in6_tmpifadd(). */ if (V_ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", e)); } } ifa_free(&ia6->ia_ifa); /* * A newly added address might affect the status * of other addresses, so we check and update it. * XXX: what if address duplication happens? */ pfxlist_onlink_check(); } else { /* just set an error. do not bark here. */ error = EADDRNOTAVAIL; /* XXX: might be unused. */ } } - end: - return error; +end: + if (pr != NULL) + nd6_prefix_rele(pr); + return (error); } /* * A supplement function used in the on-link detection below; * detect if a given prefix has a (probably) reachable advertising router. * XXX: lengthy function name... */ static struct nd_pfxrouter * find_pfxlist_reachable_router(struct nd_prefix *pr) { struct nd_pfxrouter *pfxrtr; struct llentry *ln; int canreach; + ND6_LOCK_ASSERT(); + LIST_FOREACH(pfxrtr, &pr->ndpr_advrtrs, pfr_entry) { IF_AFDATA_RLOCK(pfxrtr->router->ifp); ln = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp); IF_AFDATA_RUNLOCK(pfxrtr->router->ifp); if (ln == NULL) continue; canreach = ND6_IS_LLINFO_PROBREACH(ln); LLE_RUNLOCK(ln); if (canreach) break; } return (pfxrtr); } /* * Check if each prefix in the prefix list has at least one available router * that advertised the prefix (a router is "available" if its neighbor cache * entry is reachable or probably reachable). * If the check fails, the prefix may be off-link, because, for example, * we have moved from the network but the lifetime of the prefix has not * expired yet. So we should not use the prefix if there is another prefix * that has an available router. - * But, if there is no prefix that has an available router, we still regards + * But, if there is no prefix that has an available router, we still regard * all the prefixes as on-link. This is because we can't tell if all the * routers are simply dead or if we really moved from the network and there * is no router around us. */ void pfxlist_onlink_check(void) { struct nd_prefix *pr; struct in6_ifaddr *ifa; struct nd_defrouter *dr; struct nd_pfxrouter *pfxrtr = NULL; struct rm_priotracker in6_ifa_tracker; + uint64_t genid; + uint32_t flags; + ND6_ONLINK_LOCK(); + ND6_RLOCK(); + /* * Check if there is a prefix that has a reachable advertising * router. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) break; } /* * If we have no such prefix, check whether we still have a router * that does not advertise any prefixes. */ if (pr == NULL) { - ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { struct nd_prefix *pr0; LIST_FOREACH(pr0, &V_nd_prefix, ndpr_entry) { if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) break; } if (pfxrtr != NULL) break; } - ND6_RUNLOCK(); } if (pr != NULL || (!TAILQ_EMPTY(&V_nd_defrouter) && pfxrtr == NULL)) { /* * There is at least one prefix that has a reachable router, * or at least a router which probably does not advertise * any prefixes. The latter would be the case when we move * to a new link where we have a router that does not provide * prefixes and we configure an address by hand. * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { /* XXX: a link-local prefix should never be detached */ - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) + if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || + pr->ndpr_raf_onlink == 0 || + pr->ndpr_raf_auto == 0) continue; - /* - * we aren't interested in prefixes without the L bit - * set. - */ - if (pr->ndpr_raf_onlink == 0) - continue; - - if (pr->ndpr_raf_auto == 0) - continue; - if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && find_pfxlist_reachable_router(pr) == NULL) pr->ndpr_stateflags |= NDPRF_DETACHED; - if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && + else if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && find_pfxlist_reachable_router(pr) != NULL) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } else { /* there is no prefix that has a reachable router */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) + if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || + pr->ndpr_raf_onlink == 0 || + pr->ndpr_raf_auto == 0) continue; - - if (pr->ndpr_raf_onlink == 0) - continue; - - if (pr->ndpr_raf_auto == 0) - continue; - - if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0) - pr->ndpr_stateflags &= ~NDPRF_DETACHED; + pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } /* * Remove each interface route associated with a (just) detached * prefix, and reinstall the interface route for a (just) attached * prefix. Note that all attempt of reinstallation does not * necessarily success, when a same prefix is shared among multiple * interfaces. Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them. */ +restart: LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { - int e; char ip6buf[INET6_ADDRSTRLEN]; + int e; - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) + if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || + pr->ndpr_raf_onlink == 0 || + pr->ndpr_raf_auto == 0) continue; - if (pr->ndpr_raf_onlink == 0) - continue; - - if (pr->ndpr_raf_auto == 0) - continue; - - if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && - (pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { - if ((e = nd6_prefix_offlink(pr)) != 0) { + flags = pr->ndpr_stateflags & (NDPRF_DETACHED | NDPRF_ONLINK); + if (flags == 0 || flags == (NDPRF_DETACHED | NDPRF_ONLINK)) { + genid = V_nd6_list_genid; + ND6_RUNLOCK(); + if ((flags & NDPRF_ONLINK) != 0 && + (e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); - } - } - if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && - (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 && - pr->ndpr_raf_onlink) { - if ((e = nd6_prefix_onlink(pr)) != 0) { + } else if ((flags & NDPRF_ONLINK) == 0 && + (e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d onlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } + ND6_RLOCK(); + if (genid != V_nd6_list_genid) + goto restart; } } /* * Changes on the prefix status might affect address status as well. * Make sure that all addresses derived from an attached prefix are * attached, and that all addresses derived from a detached prefix are * detached. Note, however, that a manually configured address should * always be attached. * The precise detection logic is same as the one for prefixes. */ IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF)) continue; if (ifa->ia6_ndpr == NULL) { /* * This can happen when we first configure the address * (i.e. the address exists, but the prefix does not). * XXX: complicated relationships... */ continue; } if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) break; } if (ifa) { TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_ndpr == NULL) /* XXX: see above. */ continue; if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) { if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; nd6_dad_start((struct ifaddr *)ifa, 0); } } else { ifa->ia6_flags |= IN6_IFF_DETACHED; } } } else { TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; /* Do we need a delay in this case? */ nd6_dad_start((struct ifaddr *)ifa, 0); } } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); + ND6_RUNLOCK(); + ND6_ONLINK_UNLOCK(); } static int nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) { static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct rib_head *rnh; struct rtentry *rt; struct sockaddr_in6 mask6; u_long rtflags; int error, a_failure, fibnum; /* * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs. * ifa->ifa_rtrequest = nd6_rtrequest; */ bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; rtflags = (ifa->ifa_flags & ~IFA_RTSELF) | RTF_UP; a_failure = 0; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rt = NULL; error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt, fibnum); if (error == 0) { KASSERT(rt != NULL, ("%s: in6_rtrequest return no " "error(%d) but rt is NULL, pr=%p, ifa=%p", __func__, error, pr, ifa)); rnh = rt_tables_get_rnh(rt->rt_fibnum, AF_INET6); /* XXX what if rhn == NULL? */ RIB_WLOCK(rnh); RT_LOCK(rt); if (rt_setgate(rt, rt_key(rt), (struct sockaddr *)&null_sdl) == 0) { struct sockaddr_dl *dl; dl = (struct sockaddr_dl *)rt->rt_gateway; dl->sdl_type = rt->rt_ifp->if_type; dl->sdl_index = rt->rt_ifp->if_index; } RIB_WUNLOCK(rnh); nd6_rtmsg(RTM_ADD, rt); RT_UNLOCK(rt); pr->ndpr_stateflags |= NDPRF_ONLINK; } else { char ip6buf[INET6_ADDRSTRLEN]; char ip6bufg[INET6_ADDRSTRLEN]; char ip6bufm[INET6_ADDRSTRLEN]; struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add " "route for a prefix (%s/%d) on %s, gw=%s, mask=%s, " "flags=%lx errno = %d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), ip6_sprintf(ip6bufg, &sin6->sin6_addr), ip6_sprintf(ip6bufm, &mask6.sin6_addr), rtflags, error)); /* Save last error to return, see rtinit(). */ a_failure = error; } if (rt != NULL) { RT_LOCK(rt); RT_REMREF(rt); RT_UNLOCK(rt); } } /* Return the last error we got. */ return (a_failure); } -static int +int nd6_prefix_onlink(struct nd_prefix *pr) { struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; - int error = 0; char ip6buf[INET6_ADDRSTRLEN]; + int error; - /* sanity check */ - if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { - nd6log((LOG_ERR, - "nd6_prefix_onlink: %s/%d is already on-link\n", - ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen)); + ND6_ONLINK_LOCK_ASSERT(); + ND6_UNLOCK_ASSERT(); + + if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) return (EEXIST); - } /* * Add the interface route associated with the prefix. Before * installing the route, check if there's the same prefix on another * interface, and the prefix has already installed the interface route. * Although such a configuration is expected to be rare, we explicitly * allow it. */ + ND6_RLOCK(); LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, - &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) + &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { + ND6_RUNLOCK(); return (0); + } } + ND6_RUNLOCK(); /* * We prefer link-local addresses as the associated interface address. */ /* search for a link-local addr */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY | IN6_IFF_ANYCAST); if (ifa == NULL) { /* XXX: freebsd does not have ifa_ifwithaf */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family == AF_INET6) + if (ifa->ifa_addr->sa_family == AF_INET6) { + ifa_ref(ifa); break; + } } - if (ifa != NULL) - ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); /* should we care about ia6_flags? */ } if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA * containing a prefix with the L bit set and the A bit clear, * after removing all IPv6 addresses on the receiving * interface. This should, of course, be rare though. */ nd6log((LOG_NOTICE, "nd6_prefix_onlink: failed to find any ifaddr" " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); return (0); } error = nd6_prefix_onlink_rtrequest(pr, ifa); if (ifa != NULL) ifa_free(ifa); return (error); } -static int +int nd6_prefix_offlink(struct nd_prefix *pr) { int error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6; struct rtentry *rt; char ip6buf[INET6_ADDRSTRLEN]; + uint64_t genid; int fibnum, a_failure; - /* sanity check */ - if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { - nd6log((LOG_ERR, - "nd6_prefix_offlink: %s/%d is already off-link\n", - ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen)); + ND6_ONLINK_LOCK_ASSERT(); + ND6_UNLOCK_ASSERT(); + + if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) return (EEXIST); - } bzero(&sa6, sizeof(sa6)); sa6.sin6_family = AF_INET6; sa6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr, sizeof(struct in6_addr)); bzero(&mask6, sizeof(mask6)); mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); a_failure = 0; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rt = NULL; error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, (struct sockaddr *)&mask6, 0, &rt, fibnum); if (error == 0) { /* report the route deletion to the routing socket. */ if (rt != NULL) nd6_rtmsg(RTM_DELETE, rt); } else { /* Save last error to return, see rtinit(). */ a_failure = error; } if (rt != NULL) { RTFREE(rt); } } error = a_failure; a_failure = 1; if (error == 0) { pr->ndpr_stateflags &= ~NDPRF_ONLINK; /* * There might be the same prefix on another interface, * the prefix which could not be on-link just because we have * the interface route (see comments in nd6_prefix_onlink). * If there's one, try to make the prefix on-link on the * interface. */ + ND6_RLOCK(); +restart: LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) { - if (opr == pr) - continue; - - if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) - continue; - /* * KAME specific: detached prefixes should not be * on-link. */ - if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) + if (opr == pr || (opr->ndpr_stateflags & + (NDPRF_ONLINK | NDPRF_DETACHED)) != 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { int e; + genid = V_nd6_list_genid; + ND6_RUNLOCK(); if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " "to %s (errno = %d)\n", ip6_sprintf(ip6buf, &opr->ndpr_prefix.sin6_addr), opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); } else a_failure = 0; + ND6_RLOCK(); + if (genid != V_nd6_list_genid) + goto restart; } } + ND6_RUNLOCK(); } else { /* XXX: can we still set the NDPRF_ONLINK flag? */ nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " "%s/%d on %s (errno = %d)\n", ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen, if_name(ifp), error)); } if (a_failure) lltable_prefix_free(AF_INET6, (struct sockaddr *)&sa6, (struct sockaddr *)&mask6, LLE_STATIC); return (error); } static struct in6_ifaddr * in6_ifadd(struct nd_prefixctl *pr, int mcast) { struct ifnet *ifp = pr->ndpr_ifp; struct ifaddr *ifa; struct in6_aliasreq ifra; struct in6_ifaddr *ia, *ib; int error, plen0; struct in6_addr mask; int prefixlen = pr->ndpr_plen; int updateflags; char ip6buf[INET6_ADDRSTRLEN]; in6_prefixlen2mask(&mask, prefixlen); /* * find a link-local address (will be interface ID). * Is it really mandatory? Theoretically, a global or a site-local * address can be configured without a link-local address, if we * have a unique interface identifier... * * it is not mandatory to have a link-local address, we can generate * interface identifier on the fly. we do this because: * (1) it should be the easiest way to find interface identifier. * (2) RFC2462 5.4 suggesting the use of the same interface identifier * for multiple addresses on a single interface, and possible shortcut * of DAD. we omitted DAD for this reason in the past. * (3) a user can prevent autoconfiguration of global address * by removing link-local address by hand (this is partly because we * don't have other way to control the use of IPv6 on an interface. * this has been our design choice - cf. NRL's "ifconfig auto"). * (4) it is easier to manage when an interface has addresses * with the same interface identifier, than to have multiple addresses * with different interface identifiers. */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */ if (ifa) ib = (struct in6_ifaddr *)ifa; else return NULL; /* prefixlen + ifidlen must be equal to 128 */ plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); if (prefixlen != plen0) { ifa_free(ifa); nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " "(prefix=%d ifid=%d)\n", if_name(ifp), prefixlen, 128 - plen0)); return NULL; } /* make ifaddr */ in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask); IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask); /* interface ID */ ifra.ifra_addr.sin6_addr.s6_addr32[0] |= (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); ifra.ifra_addr.sin6_addr.s6_addr32[1] |= (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); ifa_free(ifa); /* lifetimes. */ ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; /* XXX: scope zone ID? */ ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ /* * Make sure that we do not have this address already. This should * usually not happen, but we can still see this case, e.g., if we * have manually configured the exact address to be configured. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (ifa != NULL) { ifa_free(ifa); /* this should be rare enough to make an explicit log */ log(LOG_INFO, "in6_ifadd: %s is already configured\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr)); return (NULL); } /* * Allocate ifaddr structure, link into chain, etc. * If we are going to create a new address upon receiving a multicasted * RA, we need to impose a random delay before starting DAD. * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] */ updateflags = 0; if (mcast) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) { nd6log((LOG_ERR, "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr), if_name(ifp), error)); return (NULL); /* ifaddr must not have been allocated. */ } ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); /* * XXXRW: Assumption of non-NULLness here might not be true with * fine-grained locking -- should we validate it? Or just return * earlier ifa rather than looking it up again? */ return (ia); /* this is always non-NULL and referenced. */ } /* * ia0 - corresponding public address */ int in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) { struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia; struct in6_aliasreq ifra; int error; int trylimit = 3; /* XXX: adhoc value */ int updateflags; u_int32_t randid[2]; time_t vltime0, pltime0; in6_prepare_ifra(&ifra, &ia0->ia_addr.sin6_addr, &ia0->ia_prefixmask.sin6_addr); ifra.ifra_addr = ia0->ia_addr; /* XXX: do we need this ? */ /* clear the old IFID */ IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &ifra.ifra_prefixmask.sin6_addr); again: if (in6_get_tmpifid(ifp, (u_int8_t *)randid, (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) { nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good " "random IFID\n")); return (EINVAL); } ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); /* * in6_get_tmpifid() quite likely provided a unique interface ID. * However, we may still have a chance to see collision, because * there may be a time lag between generation of the ID and generation * of the address. So, we'll do one more sanity check. */ if (in6_localip(&ifra.ifra_addr.sin6_addr) != 0) { if (trylimit-- > 0) { forcegen = 1; goto again; } /* Give up. Something strange should have happened. */ nd6log((LOG_NOTICE, "in6_tmpifadd: failed to " "find a unique random IFID\n")); return (EEXIST); } /* * The Valid Lifetime is the lower of the Valid Lifetime of the * public address or TEMP_VALID_LIFETIME. * The Preferred Lifetime is the lower of the Preferred Lifetime * of the public address or TEMP_PREFERRED_LIFETIME - * DESYNC_FACTOR. */ if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { vltime0 = IFA6_IS_INVALID(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_vltime - (time_uptime - ia0->ia6_updatetime)); if (vltime0 > V_ip6_temp_valid_lifetime) vltime0 = V_ip6_temp_valid_lifetime; } else vltime0 = V_ip6_temp_valid_lifetime; if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_pltime - (time_uptime - ia0->ia6_updatetime)); if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){ pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; } } else pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; /* * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. */ if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance) return (0); /* XXX: scope zone ID? */ ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY); /* allocate ifaddr structure, link into chain, etc. */ updateflags = 0; if (delay) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen? */ nd6log((LOG_ERR, "in6_tmpifadd: ifa update succeeded, but we got " "no ifaddr\n")); return (EINVAL); /* XXX */ } newia->ia6_ndpr = ia0->ia6_ndpr; newia->ia6_ndpr->ndpr_addrcnt++; ifa_free(&newia->ia_ifa); /* * A newly added address might affect the status of other addresses. * XXX: when the temporary address is generated with a new public * address, the onlink check is redundant. However, it would be safe * to do the check explicitly everywhere a new address is generated, * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. */ pfxlist_onlink_check(); return (0); } static int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_preferred = 0; else ndpr->ndpr_preferred = time_uptime + ndpr->ndpr_pltime; if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_expire = 0; else ndpr->ndpr_expire = time_uptime + ndpr->ndpr_vltime; return 0; } static void in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) { /* init ia6t_expire */ if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { lt6->ia6t_expire = time_uptime; lt6->ia6t_expire += lt6->ia6t_vltime; } /* init ia6t_preferred */ if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { lt6->ia6t_preferred = time_uptime; lt6->ia6t_preferred += lt6->ia6t_pltime; } } /* * Delete all the routing table entries that use the specified gateway. * XXX: this function causes search through all entries of routing table, so * it shouldn't be called when acting as a router. */ void rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) { /* We'll care only link-local addresses */ if (!IN6_IS_ADDR_LINKLOCAL(gateway)) return; /* XXX Do we really need to walk any but the default FIB? */ rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)gateway); } static int rt6_deleteroute(const struct rtentry *rt, void *arg) { #define SIN6(s) ((struct sockaddr_in6 *)s) struct in6_addr *gate = (struct in6_addr *)arg; if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) return (0); if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) { return (0); } /* * Do not delete a static route. * XXX: this seems to be a bit ad-hoc. Should we consider the * 'cloned' bit instead? */ if ((rt->rt_flags & RTF_STATIC) != 0) return (0); /* * We delete only host route. This means, in particular, we don't * delete default route. */ if ((rt->rt_flags & RTF_HOST) == 0) return (0); return (1); #undef SIN6 } int nd6_setdefaultiface(int ifindex) { int error = 0; if (ifindex < 0 || V_if_index < ifindex) return (EINVAL); if (ifindex != 0 && !ifnet_byindex(ifindex)) return (EINVAL); if (V_nd6_defifindex != ifindex) { V_nd6_defifindex = ifindex; if (V_nd6_defifindex > 0) V_nd6_defifp = ifnet_byindex(V_nd6_defifindex); else V_nd6_defifp = NULL; /* * Our current implementation assumes one-to-one maping between * interfaces and links, so it would be natural to use the * default interface as the default link. */ scope6_setdefault(V_nd6_defifp); } return (error); } Index: user/alc/PQ_LAUNDRY/sys/sys/apm.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/apm.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/sys/apm.h (revision 306832) @@ -1,69 +1,5 @@ /*- - * Copyright (c) 2007 Marcel Moolenaar - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ + * This file is in the public domain. */ - -#ifndef _SYS_APM_H_ -#define _SYS_APM_H_ - -/* Driver Descriptor Record. */ -struct apm_ddr { - uint16_t ddr_sig; -#define APM_DDR_SIG 0x4552 - uint16_t ddr_blksize; - uint32_t ddr_blkcount; -}; - -#define APM_ENT_NAMELEN 32 -#define APM_ENT_TYPELEN 32 - -/* Partition Map Entry Record. */ -struct apm_ent { - uint16_t ent_sig; -#define APM_ENT_SIG 0x504d - uint16_t _pad_; - uint32_t ent_pmblkcnt; - uint32_t ent_start; - uint32_t ent_size; - char ent_name[APM_ENT_NAMELEN]; - char ent_type[APM_ENT_TYPELEN]; -}; - -#define APM_ENT_TYPE_SELF "Apple_partition_map" -#define APM_ENT_TYPE_UNUSED "Apple_Free" - -#define APM_ENT_TYPE_FREEBSD "FreeBSD" -#define APM_ENT_TYPE_FREEBSD_NANDFS "FreeBSD-nandfs" -#define APM_ENT_TYPE_FREEBSD_SWAP "FreeBSD-swap" -#define APM_ENT_TYPE_FREEBSD_UFS "FreeBSD-UFS" -#define APM_ENT_TYPE_FREEBSD_VINUM "FreeBSD-Vinum" -#define APM_ENT_TYPE_FREEBSD_ZFS "FreeBSD-ZFS" - -#define APM_ENT_TYPE_APPLE_BOOT "Apple_Bootstrap" -#define APM_ENT_TYPE_APPLE_HFS "Apple_HFS" -#define APM_ENT_TYPE_APPLE_UFS "Apple_UNIX_SVR2" - -#endif /* _SYS_APM_H_ */ +/* $FreeBSD$ */ +#include Property changes on: user/alc/PQ_LAUNDRY/sys/sys/apm.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/disk/apm.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/disk/apm.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/sys/disk/apm.h (revision 306832) @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 2007 Marcel Moolenaar + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_DISK_APM_H_ +#define _SYS_DISK_APM_H_ + +/* Driver Descriptor Record. */ +struct apm_ddr { + uint16_t ddr_sig; +#define APM_DDR_SIG 0x4552 + uint16_t ddr_blksize; + uint32_t ddr_blkcount; +}; + +#define APM_ENT_NAMELEN 32 +#define APM_ENT_TYPELEN 32 + +/* Partition Map Entry Record. */ +struct apm_ent { + uint16_t ent_sig; +#define APM_ENT_SIG 0x504d + uint16_t _pad_; + uint32_t ent_pmblkcnt; + uint32_t ent_start; + uint32_t ent_size; + char ent_name[APM_ENT_NAMELEN]; + char ent_type[APM_ENT_TYPELEN]; +}; + +#define APM_ENT_TYPE_SELF "Apple_partition_map" +#define APM_ENT_TYPE_UNUSED "Apple_Free" + +#define APM_ENT_TYPE_FREEBSD "FreeBSD" +#define APM_ENT_TYPE_FREEBSD_NANDFS "FreeBSD-nandfs" +#define APM_ENT_TYPE_FREEBSD_SWAP "FreeBSD-swap" +#define APM_ENT_TYPE_FREEBSD_UFS "FreeBSD-UFS" +#define APM_ENT_TYPE_FREEBSD_VINUM "FreeBSD-Vinum" +#define APM_ENT_TYPE_FREEBSD_ZFS "FreeBSD-ZFS" + +#define APM_ENT_TYPE_APPLE_BOOT "Apple_Bootstrap" +#define APM_ENT_TYPE_APPLE_HFS "Apple_HFS" +#define APM_ENT_TYPE_APPLE_UFS "Apple_UNIX_SVR2" + +#endif /* _SYS_DISK_APM_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/disk/apm.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/disk/bsd.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/disk/bsd.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/sys/disk/bsd.h (revision 306832) @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 1987, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)disklabel.h 8.2 (Berkeley) 7/10/94 + * $FreeBSD$ + */ + +#ifndef _SYS_DISK_BSD_H_ +#define _SYS_DISK_BSD_H_ + +/* The disk magic number */ +#define BSD_MAGIC 0x82564557U + +#define BSD_NPARTS_MIN 8 +#define BSD_NPARTS_MAX 20 + +/* Size of bootblock area in sector-size neutral bytes */ +#define BSD_BOOTBLOCK_SIZE 8192 + +/* partition containing whole disk */ +#define BSD_PART_RAW 2 + +/* partition normally containing swap */ +#define BSD_PART_SWAP 1 + +/* Drive-type specific data size (in number of 32-bit inegrals) */ +#define BSD_NDRIVEDATA 5 + +/* Number of spare 32-bit integrals following drive-type data */ +#define BSD_NSPARE 5 + +struct disklabel { + uint32_t d_magic; /* the magic number */ + uint16_t d_type; /* drive type */ + uint16_t d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + + char d_packname[16]; /* pack identifier */ + + /* disk geometry: */ + uint32_t d_secsize; /* # of bytes per sector */ + uint32_t d_nsectors; /* # of data sectors per track */ + uint32_t d_ntracks; /* # of tracks per cylinder */ + uint32_t d_ncylinders; /* # of data cylinders per unit */ + uint32_t d_secpercyl; /* # of data sectors per cylinder */ + uint32_t d_secperunit; /* # of data sectors per unit */ + + /* + * Spares (bad sector replacements) below are not counted in + * d_nsectors or d_secpercyl. Spare sectors are assumed to + * be physical sectors which occupy space at the end of each + * track and/or cylinder. + */ + uint16_t d_sparespertrack; /* # of spare sectors per track */ + uint16_t d_sparespercyl; /* # of spare sectors per cylinder */ + /* + * Alternate cylinders include maintenance, replacement, configuration + * description areas, etc. + */ + uint32_t d_acylinders; /* # of alt. cylinders per unit */ + + /* hardware characteristics: */ + /* + * d_interleave, d_trackskew and d_cylskew describe perturbations + * in the media format used to compensate for a slow controller. + * Interleave is physical sector interleave, set up by the + * formatter or controller when formatting. When interleaving is + * in use, logically adjacent sectors are not physically + * contiguous, but instead are separated by some number of + * sectors. It is specified as the ratio of physical sectors + * traversed per logical sector. Thus an interleave of 1:1 + * implies contiguous layout, while 2:1 implies that logical + * sector 0 is separated by one sector from logical sector 1. + * d_trackskew is the offset of sector 0 on track N relative to + * sector 0 on track N-1 on the same cylinder. Finally, d_cylskew + * is the offset of sector 0 on cylinder N relative to sector 0 + * on cylinder N-1. + */ + uint16_t d_rpm; /* rotational speed */ + uint16_t d_interleave; /* hardware sector interleave */ + uint16_t d_trackskew; /* sector 0 skew, per track */ + uint16_t d_cylskew; /* sector 0 skew, per cylinder */ + uint32_t d_headswitch; /* head switch time, usec */ + uint32_t d_trkseek; /* track-to-track seek, usec */ + uint32_t d_flags; /* generic flags */ + uint32_t d_drivedata[BSD_NDRIVEDATA]; /* drive-type specific data */ + uint32_t d_spare[BSD_NSPARE]; /* reserved for future use */ + uint32_t d_magic2; /* the magic number (again) */ + uint16_t d_checksum; /* xor of data incl. partitions */ + + /* filesystem and partition information: */ + uint16_t d_npartitions; /* number of partitions in following */ + uint32_t d_bbsize; /* size of boot area at sn0, bytes */ + uint32_t d_sbsize; /* max size of fs superblock, bytes */ + struct partition { /* the partition table */ + uint32_t p_size; /* number of sectors in partition */ + uint32_t p_offset; /* starting sector */ + uint32_t p_fsize; /* filesystem basic fragment size */ + uint8_t p_fstype; /* filesystem type, see below */ + uint8_t p_frag; /* filesystem fragments per block */ + uint16_t p_cpg; /* filesystem cylinders per group */ + } d_partitions[BSD_NPARTS_MIN]; /* actually may be more */ +}; +#ifdef CTASSERT +CTASSERT(sizeof(struct disklabel) == 148 + BSD_NPARTS_MIN * 16); +#endif + +/* d_type values: */ +#define DTYPE_SMD 1 /* SMD, XSMD; VAX hp/up */ +#define DTYPE_MSCP 2 /* MSCP */ +#define DTYPE_DEC 3 /* other DEC (rk, rl) */ +#define DTYPE_SCSI 4 /* SCSI */ +#define DTYPE_ESDI 5 /* ESDI interface */ +#define DTYPE_ST506 6 /* ST506 etc. */ +#define DTYPE_HPIB 7 /* CS/80 on HP-IB */ +#define DTYPE_HPFL 8 /* HP Fiber-link */ +#define DTYPE_FLOPPY 10 /* floppy */ +#define DTYPE_CCD 11 /* concatenated disk */ +#define DTYPE_VINUM 12 /* vinum volume */ +#define DTYPE_DOC2K 13 /* Msys DiskOnChip */ +#define DTYPE_RAID 14 /* CMU RAIDFrame */ +#define DTYPE_JFS2 16 /* IBM JFS 2 */ + +/* + * Filesystem type and version. + * Used to interpret other filesystem-specific + * per-partition information. + */ +#define FS_UNUSED 0 /* unused */ +#define FS_SWAP 1 /* swap */ +#define FS_V6 2 /* Sixth Edition */ +#define FS_V7 3 /* Seventh Edition */ +#define FS_SYSV 4 /* System V */ +#define FS_V71K 5 /* V7 with 1K blocks (4.1, 2.9) */ +#define FS_V8 6 /* Eighth Edition, 4K blocks */ +#define FS_BSDFFS 7 /* 4.2BSD fast filesystem */ +#define FS_MSDOS 8 /* MSDOS filesystem */ +#define FS_BSDLFS 9 /* 4.4BSD log-structured filesystem */ +#define FS_OTHER 10 /* in use, but unknown/unsupported */ +#define FS_HPFS 11 /* OS/2 high-performance filesystem */ +#define FS_ISO9660 12 /* ISO 9660, normally CD-ROM */ +#define FS_BOOT 13 /* partition contains bootstrap */ +#define FS_VINUM 14 /* Vinum drive */ +#define FS_RAID 15 /* RAIDFrame drive */ +#define FS_FILECORE 16 /* Acorn Filecore Filing System */ +#define FS_EXT2FS 17 /* ext2fs */ +#define FS_NTFS 18 /* Windows/NT file system */ +#define FS_CCD 20 /* concatenated disk component */ +#define FS_JFS2 21 /* IBM JFS2 */ +#define FS_HAMMER 22 /* DragonFlyBSD Hammer FS */ +#define FS_HAMMER2 23 /* DragonFlyBSD Hammer2 FS */ +#define FS_UDF 24 /* UDF */ +#define FS_EFS 26 /* SGI's Extent File system */ +#define FS_ZFS 27 /* Sun's ZFS */ +#define FS_NANDFS 30 /* FreeBSD nandfs (NiLFS derived) */ + +/* + * flags shared by various drives: + */ +#define D_REMOVABLE 0x01 /* removable media */ +#define D_ECC 0x02 /* supports ECC */ +#define D_BADSECT 0x04 /* supports bad sector forw. */ +#define D_RAMDISK 0x08 /* disk emulator */ +#define D_CHAIN 0x10 /* can do back-back transfers */ + +#endif /* !_SYS_DISK_BSD_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/disk/bsd.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/disk/gpt.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/disk/gpt.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/sys/disk/gpt.h (revision 306832) @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 2002 Marcel Moolenaar + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_DISK_GPT_H_ +#define _SYS_DISK_GPT_H_ + +/* + * Applications can define GPT_UUID_TYPE if they want the GPT structures + * to use a particular type definition for UUIDs/GUIDs. This header uses + * a generic (DCE 1.1 compatible) definition otherwise. + */ +#ifndef GPT_UUID_TYPE +struct gpt_uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint8_t clock_seq_hi_and_reserved; + uint8_t clock_seq_low; + uint8_t node[6]; +}; +#define GPT_UUID_TYPE struct gpt_uuid +#endif /* !GPT_UUID_TYPE */ + +typedef GPT_UUID_TYPE gpt_uuid_t; + +#ifdef CTASSERT +CTASSERT(sizeof(gpt_uuid_t) == 16); +#endif + +struct gpt_hdr { + char hdr_sig[8]; +#define GPT_HDR_SIG "EFI PART" + uint32_t hdr_revision; +#define GPT_HDR_REVISION 0x00010000 + uint32_t hdr_size; + uint32_t hdr_crc_self; + uint32_t __reserved; + uint64_t hdr_lba_self; + uint64_t hdr_lba_alt; + uint64_t hdr_lba_start; + uint64_t hdr_lba_end; + gpt_uuid_t hdr_uuid; + uint64_t hdr_lba_table; + uint32_t hdr_entries; + uint32_t hdr_entsz; + uint32_t hdr_crc_table; + /* + * The header as defined in the EFI spec is not a multiple of 8 bytes + * and given that the alignment requirement is on an 8 byte boundary, + * padding will happen. We make the padding explicit so that we can + * correct the value returned by sizeof() when we put the size of the + * header in field hdr_size, or otherwise use offsetof(). + */ + uint32_t padding; +}; +#ifdef CTASSERT +CTASSERT(offsetof(struct gpt_hdr, padding) == 92); +#endif + +struct gpt_ent { + gpt_uuid_t ent_type; + gpt_uuid_t ent_uuid; + uint64_t ent_lba_start; + uint64_t ent_lba_end; + uint64_t ent_attr; +#define GPT_ENT_ATTR_PLATFORM (1ULL << 0) +#define GPT_ENT_ATTR_BOOTME (1ULL << 59) +#define GPT_ENT_ATTR_BOOTONCE (1ULL << 58) +#define GPT_ENT_ATTR_BOOTFAILED (1ULL << 57) + uint16_t ent_name[36]; /* UTF-16. */ +}; +#ifdef CTASSERT +CTASSERT(sizeof(struct gpt_ent) == 128); +#endif /* CTASSERT */ + +#define GPT_ENT_TYPE_UNUSED \ + {0x00000000,0x0000,0x0000,0x00,0x00,{0x00,0x00,0x00,0x00,0x00,0x00}} +#define GPT_ENT_TYPE_EFI \ + {0xc12a7328,0xf81f,0x11d2,0xba,0x4b,{0x00,0xa0,0xc9,0x3e,0xc9,0x3b}} +#define GPT_ENT_TYPE_MBR \ + {0x024dee41,0x33e7,0x11d3,0x9d,0x69,{0x00,0x08,0xc7,0x81,0xf3,0x9f}} +#define GPT_ENT_TYPE_FREEBSD \ + {0x516e7cb4,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} +#define GPT_ENT_TYPE_FREEBSD_BOOT \ + {0x83bd6b9d,0x7f41,0x11dc,0xbe,0x0b,{0x00,0x15,0x60,0xb8,0x4f,0x0f}} +#define GPT_ENT_TYPE_FREEBSD_NANDFS \ + {0x74ba7dd9,0xa689,0x11e1,0xbd,0x04,{0x00,0xe0,0x81,0x28,0x6a,0xcf}} +#define GPT_ENT_TYPE_FREEBSD_SWAP \ + {0x516e7cb5,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} +#define GPT_ENT_TYPE_FREEBSD_UFS \ + {0x516e7cb6,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} +#define GPT_ENT_TYPE_FREEBSD_VINUM \ + {0x516e7cb8,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} +#define GPT_ENT_TYPE_FREEBSD_ZFS \ + {0x516e7cba,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} +#define GPT_ENT_TYPE_PREP_BOOT \ + {0x9e1a2d38,0xc612,0x4316,0xaa,0x26,{0x8b,0x49,0x52,0x1e,0x5a,0x8b}} + +/* + * The following are unused but documented here to avoid reuse. + * + * GPT_ENT_TYPE_FREEBSD_UFS2 \ + * {0x516e7cb7,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} + */ + +/* + * Foreign partition types that we're likely to encounter. Note that Linux + * apparently choose to share data partitions with MS. I don't what the + * advantage might be. I can see how sharing swap partitions is advantageous + * though. + */ +#define GPT_ENT_TYPE_MS_BASIC_DATA \ + {0xebd0a0a2,0xb9e5,0x4433,0x87,0xc0,{0x68,0xb6,0xb7,0x26,0x99,0xc7}} +#define GPT_ENT_TYPE_MS_LDM_DATA \ + {0xaf9b60a0,0x1431,0x4f62,0xbc,0x68,{0x33,0x11,0x71,0x4a,0x69,0xad}} +#define GPT_ENT_TYPE_MS_LDM_METADATA \ + {0x5808c8aa,0x7e8f,0x42e0,0x85,0xd2,{0xe1,0xe9,0x04,0x34,0xcf,0xb3}} +#define GPT_ENT_TYPE_MS_RECOVERY \ + {0xde94bba4,0x06d1,0x4d40,0xa1,0x6a,{0xbf,0xd5,0x01,0x79,0xd6,0xac}} +#define GPT_ENT_TYPE_MS_RESERVED \ + {0xe3c9e316,0x0b5c,0x4db8,0x81,0x7d,{0xf9,0x2d,0xf0,0x02,0x15,0xae}} +#define GPT_ENT_TYPE_MS_SPACES \ + {0xe75caf8f,0xf680,0x4cee,0xaf,0xa3,{0xb0,0x01,0xe5,0x6e,0xfc,0x2d}} + +#define GPT_ENT_TYPE_LINUX_DATA \ + {0x0fc63daf,0x8483,0x4772,0x8e,0x79,{0x3d,0x69,0xd8,0x47,0x7d,0xe4}} +#define GPT_ENT_TYPE_LINUX_RAID \ + {0xa19d880f,0x05fc,0x4d3b,0xa0,0x06,{0x74,0x3f,0x0f,0x84,0x91,0x1e}} +#define GPT_ENT_TYPE_LINUX_SWAP \ + {0x0657fd6d,0xa4ab,0x43c4,0x84,0xe5,{0x09,0x33,0xc8,0x4b,0x4f,0x4f}} +#define GPT_ENT_TYPE_LINUX_LVM \ + {0xe6d6d379,0xf507,0x44c2,0xa2,0x3c,{0x23,0x8f,0x2a,0x3d,0xf9,0x28}} + +#define GPT_ENT_TYPE_VMFS \ + {0xaa31e02a,0x400f,0x11db,0x95,0x90,{0x00,0x0c,0x29,0x11,0xd1,0xb8}} +#define GPT_ENT_TYPE_VMKDIAG \ + {0x9d275380,0x40ad,0x11db,0xbf,0x97,{0x00,0x0c,0x29,0x11,0xd1,0xb8}} +#define GPT_ENT_TYPE_VMRESERVED \ + {0x9198effc,0x31c0,0x11db,0x8f,0x78,{0x00,0x0c,0x29,0x11,0xd1,0xb8}} +#define GPT_ENT_TYPE_VMVSANHDR \ + {0x381cfccc,0x7288,0x11e0,0x92,0xee,{0x00,0x0c,0x29,0x11,0xd0,0xb2}} + +#define GPT_ENT_TYPE_APPLE_BOOT \ + {0x426F6F74,0x0000,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} +#define GPT_ENT_TYPE_APPLE_HFS \ + {0x48465300,0x0000,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} +#define GPT_ENT_TYPE_APPLE_UFS \ + {0x55465300,0x0000,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} +#define GPT_ENT_TYPE_APPLE_ZFS \ + {0x6a898cc3,0x1dd2,0x11b2,0x99,0xa6,{0x08,0x00,0x20,0x73,0x66,0x31}} +#define GPT_ENT_TYPE_APPLE_RAID \ + {0x52414944,0x0000,0x11aa,0xaa,0x22,{0x00,0x30,0x65,0x43,0xec,0xac}} +#define GPT_ENT_TYPE_APPLE_RAID_OFFLINE \ + {0x52414944,0x5f4f,0x11aa,0xaa,0x22,{0x00,0x30,0x65,0x43,0xec,0xac}} +#define GPT_ENT_TYPE_APPLE_LABEL \ + {0x4C616265,0x6c00,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} +#define GPT_ENT_TYPE_APPLE_TV_RECOVERY \ + {0x5265636f,0x7665,0x11AA,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} +#define GPT_ENT_TYPE_APPLE_CORE_STORAGE \ + {0x53746f72,0x6167,0x11AA,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} + +#define GPT_ENT_TYPE_NETBSD_FFS \ + {0x49f48d5a,0xb10e,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} +#define GPT_ENT_TYPE_NETBSD_LFS \ + {0x49f48d82,0xb10e,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} +#define GPT_ENT_TYPE_NETBSD_SWAP \ + {0x49f48d32,0xb10e,0x11dc,0xB9,0x9B,{0x00,0x19,0xd1,0x87,0x96,0x48}} +#define GPT_ENT_TYPE_NETBSD_RAID \ + {0x49f48daa,0xb10e,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} +#define GPT_ENT_TYPE_NETBSD_CCD \ + {0x2db519c4,0xb10f,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} +#define GPT_ENT_TYPE_NETBSD_CGD \ + {0x2db519ec,0xb10f,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} + +#define GPT_ENT_TYPE_DRAGONFLY_LABEL32 \ + {0x9d087404,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_SWAP \ + {0x9d58fdbd,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_UFS1 \ + {0x9d94ce7c,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_VINUM \ + {0x9dd4478f,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_CCD \ + {0xdbd5211b,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_LABEL64 \ + {0x3d48ce54,0x1d16,0x11dc,0x86,0x96,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_LEGACY \ + {0xbd215ab2,0x1d16,0x11dc,0x86,0x96,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_HAMMER \ + {0x61dc63ac,0x6e38,0x11dc,0x85,0x13,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} +#define GPT_ENT_TYPE_DRAGONFLY_HAMMER2 \ + {0x5cbb9ad1,0x862d,0x11dc,0xa9,0x4d,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} + +#define GPT_ENT_TYPE_CHROMEOS_FIRMWARE \ + {0xcab6e88e,0xabf3,0x4102,0xa0,0x7a,{0xd4,0xbb,0x9b,0xe3,0xc1,0xd3}} +#define GPT_ENT_TYPE_CHROMEOS_KERNEL \ + {0xfe3a2a5d,0x4f32,0x41a7,0xb7,0x25,{0xac,0xcc,0x32,0x85,0xa3,0x09}} +#define GPT_ENT_TYPE_CHROMEOS_RESERVED \ + {0x2e0a753d,0x9e48,0x43b0,0x83,0x37,{0xb1,0x51,0x92,0xcb,0x1b,0x5e}} +#define GPT_ENT_TYPE_CHROMEOS_ROOT \ + {0x3cb8e202,0x3b7e,0x47dd,0x8a,0x3c,{0x7f,0xf2,0xa1,0x3c,0xfc,0xec}} + +#define GPT_ENT_TYPE_OPENBSD_DATA \ + {0x824cc7a0,0x36a8,0x11e3,0x89,0x0a,{0x95,0x25,0x19,0xad,0x3f,0x61}} + +/* + * Boot partition used by GRUB 2. + */ +#define GPT_ENT_TYPE_BIOS_BOOT \ + {0x21686148,0x6449,0x6e6f,0x74,0x4e,{0x65,0x65,0x64,0x45,0x46,0x49}} + +#endif /* _SYS_DISK_GPT_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/disk/gpt.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/disk/mbr.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/disk/mbr.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/sys/disk/mbr.h (revision 306832) @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 1987, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)disklabel.h 8.2 (Berkeley) 7/10/94 + * $FreeBSD$ + */ + +#ifndef _SYS_DISK_MBR_H_ +#define _SYS_DISK_MBR_H_ + +#define DOSBBSECTOR 0 /* DOS boot block relative sector number */ +#define DOSDSNOFF 440 /* WinNT/2K/XP Drive Serial Number offset */ +#define DOSPARTOFF 446 +#define DOSPARTSIZE 16 +#define NDOSPART 4 +#define NEXTDOSPART 32 +#define DOSMAGICOFFSET 510 +#define DOSMAGIC 0xAA55 + +#define DOSPTYP_EXT 0x05 /* DOS extended partition */ +#define DOSPTYP_FAT16 0x06 /* FAT16 partition */ +#define DOSPTYP_NTFS 0x07 /* NTFS partition */ +#define DOSPTYP_FAT32 0x0b /* FAT32 partition */ +#define DOSPTYP_EXTLBA 0x0f /* DOS extended partition */ +#define DOSPTYP_PPCBOOT 0x41 /* PReP/CHRP boot partition */ +#define DOSPTYP_LDM 0x42 /* Win2k dynamic extended partition */ +#define DOSPTYP_LINSWP 0x82 /* Linux swap partition */ +#define DOSPTYP_LINUX 0x83 /* Linux partition */ +#define DOSPTYP_LINLVM 0x8e /* Linux LVM partition */ +#define DOSPTYP_386BSD 0xa5 /* 386BSD partition type */ +#define DOSPTYP_APPLE_UFS 0xa8 /* Apple Mac OS X boot */ +#define DOSPTYP_APPLE_BOOT 0xab /* Apple Mac OS X UFS */ +#define DOSPTYP_HFS 0xaf /* HFS/HFS+ partition type */ +#define DOSPTYP_PMBR 0xee /* GPT Protective MBR */ +#define DOSPTYP_EFI 0xef /* EFI FAT parition */ +#define DOSPTYP_VMFS 0xfb /* VMware VMFS partition */ +#define DOSPTYP_VMKDIAG 0xfc /* VMware vmkDiagnostic partition */ +#define DOSPTYP_LINRAID 0xfd /* Linux raid partition */ + +struct dos_partition { + unsigned char dp_flag; /* bootstrap flags */ + unsigned char dp_shd; /* starting head */ + unsigned char dp_ssect; /* starting sector */ + unsigned char dp_scyl; /* starting cylinder */ + unsigned char dp_typ; /* partition type */ + unsigned char dp_ehd; /* end head */ + unsigned char dp_esect; /* end sector */ + unsigned char dp_ecyl; /* end cylinder */ + uint32_t dp_start; /* absolute starting sector number */ + uint32_t dp_size; /* partition size in sectors */ +}; +#ifdef CTASSERT +CTASSERT(sizeof (struct dos_partition) == DOSPARTSIZE); +#endif + +#define DPSECT(s) ((s) & 0x3f) /* isolate relevant bits of sector */ +#define DPCYL(c, s) ((c) + (((s) & 0xc0)<<2)) /* and those that are cylinder */ + +#endif /* !_SYS_DISK_MBR_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/disk/mbr.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/disk/pc98.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/disk/pc98.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/sys/disk/pc98.h (revision 306832) @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 1987, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)disklabel.h 8.2 (Berkeley) 7/10/94 + * $FreeBSD$ + */ + +#ifndef _SYS_DISK_PC98_H_ +#define _SYS_DISK_PC98_H_ + +#define PC98_BBSECTOR 1 /* DOS boot block relative sector number */ +#define PC98_PARTOFF 0 +#define PC98_PARTSIZE 32 +#define PC98_NPARTS 16 +#define PC98_MAGICOFS 510 +#define PC98_MAGIC 0xAA55 + +#define PC98_MID_BOOTABLE 0x80 +#define PC98_MID_MASK 0x7f +#define PC98_MID_386BSD 0x14 + +#define PC98_SID_ACTIVE 0x80 +#define PC98_SID_MASK 0x7f +#define PC98_SID_386BSD 0x44 + +#define __DOSMID_386BSD (PC98_MID_386BSD | PC98_MID_BOOTABLE) +#define __DOSSID_386BSD (PC98_SID_386BSD | PC98_SID_ACTIVE) +#define PC98_PTYP_386BSD (__DOSSID_386BSD << 8 | __DOSMID_386BSD) + +struct pc98_partition { + unsigned char dp_mid; + unsigned char dp_sid; + unsigned char dp_dum1; + unsigned char dp_dum2; + unsigned char dp_ipl_sct; + unsigned char dp_ipl_head; + unsigned short dp_ipl_cyl; + unsigned char dp_ssect; /* starting sector */ + unsigned char dp_shd; /* starting head */ + unsigned short dp_scyl; /* starting cylinder */ + unsigned char dp_esect; /* end sector */ + unsigned char dp_ehd; /* end head */ + unsigned short dp_ecyl; /* end cylinder */ + unsigned char dp_name[16]; +}; +#ifdef CTASSERT +CTASSERT(sizeof (struct pc98_partition) == PC98_PARTSIZE); +#endif + +#endif /* !_SYS_DISK_PC98_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/disk/pc98.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/disk/vtoc.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/disk/vtoc.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/sys/disk/vtoc.h (revision 306832) @@ -0,0 +1,108 @@ +/*- + * Copyright (c) 2008 Marcel Moolenaar + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_DISK_VTOC_H_ +#define _SYS_DISK_VTOC_H_ + +#define VTOC_TAG_UNASSIGNED 0x00 +#define VTOC_TAG_BOOT 0x01 +#define VTOC_TAG_ROOT 0x02 +#define VTOC_TAG_SWAP 0x03 +#define VTOC_TAG_USR 0x04 +#define VTOC_TAG_BACKUP 0x05 /* "c" partition */ +#define VTOC_TAG_STAND 0x06 +#define VTOC_TAG_VAR 0x07 +#define VTOC_TAG_HOME 0x08 +#define VTOC_TAG_ALTSCTR 0x09 /* alternate sector partition */ +#define VTOC_TAG_CACHE 0x0a /* Solaris cachefs partition */ +#define VTOC_TAG_VXVM_PUB 0x0e /* VxVM public region */ +#define VTOC_TAG_VXVM_PRIV 0x0f /* VxVM private region */ + +/* NetBSD/mips defines this */ +#define VTOC_TAG_NETBSD_FFS 0xff + +/* FreeBSD tags: the high byte equals ELFOSABI_FREEBSD */ +#define VTOC_TAG_FREEBSD_SWAP 0x0901 +#define VTOC_TAG_FREEBSD_UFS 0x0902 +#define VTOC_TAG_FREEBSD_VINUM 0x0903 +#define VTOC_TAG_FREEBSD_ZFS 0x0904 +#define VTOC_TAG_FREEBSD_NANDFS 0x0905 + +#define VTOC_FLAG_UNMNT 0x01 /* unmountable partition */ +#define VTOC_FLAG_RDONLY 0x10 /* partition is read/only */ + +#define VTOC_ASCII_LEN 128 +#define VTOC_BOOTSIZE 8192 /* 16 sectors */ +#define VTOC_MAGIC 0xdabe +#define VTOC_RAW_PART 2 +#define VTOC_SANITY 0x600ddeee +#define VTOC_VERSION 1 +#define VTOC_VOLUME_LEN 8 + +#define VTOC8_NPARTS 8 + +struct vtoc8 { + char ascii[VTOC_ASCII_LEN]; + uint32_t version; + char volume[VTOC_VOLUME_LEN]; + uint16_t nparts; + struct { + uint16_t tag; + uint16_t flag; + } part[VTOC8_NPARTS]; + uint16_t __alignment; + uint32_t bootinfo[3]; + uint32_t sanity; + uint32_t reserved[10]; + uint32_t timestamp[VTOC8_NPARTS]; + uint16_t wskip; + uint16_t rskip; + char padding[152]; + uint16_t rpm; + uint16_t physcyls; + uint16_t sparesecs; + uint16_t spare1[2]; + uint16_t interleave; + uint16_t ncyls; + uint16_t altcyls; + uint16_t nheads; + uint16_t nsecs; + uint16_t spare2[2]; + struct { + uint32_t cyl; + uint32_t nblks; + } map[VTOC8_NPARTS]; + uint16_t magic; + uint16_t cksum; +}; + +#ifdef CTASSERT +CTASSERT(sizeof(struct vtoc8) == 512); +#endif + +#endif /* _SYS_DISK_VTOC_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/disk/vtoc.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/disklabel.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/disklabel.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/sys/disklabel.h (revision 306832) @@ -1,305 +1,169 @@ /*- * Copyright (c) 1987, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)disklabel.h 8.2 (Berkeley) 7/10/94 * $FreeBSD$ */ #ifndef _SYS_DISKLABEL_H_ #define _SYS_DISKLABEL_H_ #ifndef _KERNEL #include #endif #include -/* - * Disk description table, see disktab(5) - */ +#include + +/* Disk description table, see disktab(5) */ #define _PATH_DISKTAB "/etc/disktab" /* - * Each disk has a label which includes information about the hardware - * disk geometry, filesystem partitions, and drive specific information. * The label is in block 0 or 1, possibly offset from the beginning * to leave room for a bootstrap, etc. + * XXX these should be defined per controller (or drive) elsewhere, not here! + * XXX in actuality it can't even be per controller or drive. It should be + * constant/fixed across storage hardware and CPU architectures. Disks can + * travel from one machine to another and a label created on one machine + * should be detectable and understood by the other. */ - -/* XXX these should be defined per controller (or drive) elsewhere, not here! */ -#if defined(__i386__) || defined(__amd64__) || defined(__arm__) || \ - defined(__powerpc__) || defined(__mips__) #define LABELSECTOR 1 /* sector containing label */ #define LABELOFFSET 0 /* offset of label in sector */ -#endif -#define DISKMAGIC ((u_int32_t)0x82564557) /* The disk magic number */ +#define DISKMAGIC BSD_MAGIC /* The disk magic number */ + #ifndef MAXPARTITIONS -#define MAXPARTITIONS 8 +#define MAXPARTITIONS BSD_NPARTS_MIN #endif /* Size of bootblock area in sector-size neutral bytes */ -#define BBSIZE 8192 +#define BBSIZE BSD_BOOTBLOCK_SIZE -#define LABEL_PART 2 /* partition containing label */ -#define RAW_PART 2 /* partition containing whole disk */ -#define SWAP_PART 1 /* partition normally containing swap */ +#define LABEL_PART BSD_PART_RAW +#define RAW_PART BSD_PART_RAW +#define SWAP_PART BSD_PART_SWAP -struct disklabel { - u_int32_t d_magic; /* the magic number */ - u_int16_t d_type; /* drive type */ - u_int16_t d_subtype; /* controller/d_type specific */ - char d_typename[16]; /* type name, e.g. "eagle" */ +#define NDDATA BSD_NDRIVEDATA +#define NSPARE BSD_NSPARE - char d_packname[16]; /* pack identifier */ - - /* disk geometry: */ - u_int32_t d_secsize; /* # of bytes per sector */ - u_int32_t d_nsectors; /* # of data sectors per track */ - u_int32_t d_ntracks; /* # of tracks per cylinder */ - u_int32_t d_ncylinders; /* # of data cylinders per unit */ - u_int32_t d_secpercyl; /* # of data sectors per cylinder */ - u_int32_t d_secperunit; /* # of data sectors per unit */ - - /* - * Spares (bad sector replacements) below are not counted in - * d_nsectors or d_secpercyl. Spare sectors are assumed to - * be physical sectors which occupy space at the end of each - * track and/or cylinder. - */ - u_int16_t d_sparespertrack; /* # of spare sectors per track */ - u_int16_t d_sparespercyl; /* # of spare sectors per cylinder */ - /* - * Alternate cylinders include maintenance, replacement, configuration - * description areas, etc. - */ - u_int32_t d_acylinders; /* # of alt. cylinders per unit */ - - /* hardware characteristics: */ - /* - * d_interleave, d_trackskew and d_cylskew describe perturbations - * in the media format used to compensate for a slow controller. - * Interleave is physical sector interleave, set up by the - * formatter or controller when formatting. When interleaving is - * in use, logically adjacent sectors are not physically - * contiguous, but instead are separated by some number of - * sectors. It is specified as the ratio of physical sectors - * traversed per logical sector. Thus an interleave of 1:1 - * implies contiguous layout, while 2:1 implies that logical - * sector 0 is separated by one sector from logical sector 1. - * d_trackskew is the offset of sector 0 on track N relative to - * sector 0 on track N-1 on the same cylinder. Finally, d_cylskew - * is the offset of sector 0 on cylinder N relative to sector 0 - * on cylinder N-1. - */ - u_int16_t d_rpm; /* rotational speed */ - u_int16_t d_interleave; /* hardware sector interleave */ - u_int16_t d_trackskew; /* sector 0 skew, per track */ - u_int16_t d_cylskew; /* sector 0 skew, per cylinder */ - u_int32_t d_headswitch; /* head switch time, usec */ - u_int32_t d_trkseek; /* track-to-track seek, usec */ - u_int32_t d_flags; /* generic flags */ -#define NDDATA 5 - u_int32_t d_drivedata[NDDATA]; /* drive-type specific information */ -#define NSPARE 5 - u_int32_t d_spare[NSPARE]; /* reserved for future use */ - u_int32_t d_magic2; /* the magic number (again) */ - u_int16_t d_checksum; /* xor of data incl. partitions */ - - /* filesystem and partition information: */ - u_int16_t d_npartitions; /* number of partitions in following */ - u_int32_t d_bbsize; /* size of boot area at sn0, bytes */ - u_int32_t d_sbsize; /* max size of fs superblock, bytes */ - struct partition { /* the partition table */ - u_int32_t p_size; /* number of sectors in partition */ - u_int32_t p_offset; /* starting sector */ - u_int32_t p_fsize; /* filesystem basic fragment size */ - u_int8_t p_fstype; /* filesystem type, see below */ - u_int8_t p_frag; /* filesystem fragments per block */ - u_int16_t p_cpg; /* filesystem cylinders per group */ - } d_partitions[MAXPARTITIONS]; /* actually may be more */ -}; - -#ifdef CTASSERT -CTASSERT(sizeof(struct disklabel) == 148 + MAXPARTITIONS * 16); -#endif - static __inline u_int16_t dkcksum(struct disklabel *lp); static __inline u_int16_t dkcksum(struct disklabel *lp) { u_int16_t *start, *end; u_int16_t sum = 0; start = (u_int16_t *)lp; end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions]; while (start < end) sum ^= *start++; return (sum); } - -/* d_type values: */ -#define DTYPE_SMD 1 /* SMD, XSMD; VAX hp/up */ -#define DTYPE_MSCP 2 /* MSCP */ -#define DTYPE_DEC 3 /* other DEC (rk, rl) */ -#define DTYPE_SCSI 4 /* SCSI */ -#define DTYPE_ESDI 5 /* ESDI interface */ -#define DTYPE_ST506 6 /* ST506 etc. */ -#define DTYPE_HPIB 7 /* CS/80 on HP-IB */ -#define DTYPE_HPFL 8 /* HP Fiber-link */ -#define DTYPE_FLOPPY 10 /* floppy */ -#define DTYPE_CCD 11 /* concatenated disk */ -#define DTYPE_VINUM 12 /* vinum volume */ -#define DTYPE_DOC2K 13 /* Msys DiskOnChip */ -#define DTYPE_RAID 14 /* CMU RAIDFrame */ -#define DTYPE_JFS2 16 /* IBM JFS 2 */ - #ifdef DKTYPENAMES static const char *dktypenames[] = { "unknown", "SMD", "MSCP", "old DEC", "SCSI", "ESDI", "ST506", "HP-IB", "HP-FL", "type 9", "floppy", "CCD", "Vinum", "DOC2K", "Raid", "?", "jfs", NULL }; #define DKMAXTYPES (sizeof(dktypenames) / sizeof(dktypenames[0]) - 1) #endif -/* - * Filesystem type and version. - * Used to interpret other filesystem-specific - * per-partition information. - */ -#define FS_UNUSED 0 /* unused */ -#define FS_SWAP 1 /* swap */ -#define FS_V6 2 /* Sixth Edition */ -#define FS_V7 3 /* Seventh Edition */ -#define FS_SYSV 4 /* System V */ -#define FS_V71K 5 /* V7 with 1K blocks (4.1, 2.9) */ -#define FS_V8 6 /* Eighth Edition, 4K blocks */ -#define FS_BSDFFS 7 /* 4.2BSD fast filesystem */ -#define FS_MSDOS 8 /* MSDOS filesystem */ -#define FS_BSDLFS 9 /* 4.4BSD log-structured filesystem */ -#define FS_OTHER 10 /* in use, but unknown/unsupported */ -#define FS_HPFS 11 /* OS/2 high-performance filesystem */ -#define FS_ISO9660 12 /* ISO 9660, normally CD-ROM */ -#define FS_BOOT 13 /* partition contains bootstrap */ -#define FS_VINUM 14 /* Vinum drive */ -#define FS_RAID 15 /* RAIDFrame drive */ -#define FS_FILECORE 16 /* Acorn Filecore Filing System */ -#define FS_EXT2FS 17 /* ext2fs */ -#define FS_NTFS 18 /* Windows/NT file system */ -#define FS_CCD 20 /* concatenated disk component */ -#define FS_JFS2 21 /* IBM JFS2 */ -#define FS_HAMMER 22 /* DragonFlyBSD Hammer FS */ -#define FS_HAMMER2 23 /* DragonFlyBSD Hammer2 FS */ -#define FS_UDF 24 /* UDF */ -#define FS_EFS 26 /* SGI's Extent File system */ -#define FS_ZFS 27 /* Sun's ZFS */ -#define FS_NANDFS 30 /* FreeBSD nandfs (NiLFS derived) */ - #ifdef FSTYPENAMES static const char *fstypenames[] = { "unused", "swap", "Version 6", "Version 7", "System V", "4.1BSD", "Eighth Edition", "4.2BSD", "MSDOS", "4.4LFS", "unknown", "HPFS", "ISO9660", "boot", "vinum", "raid", "Filecore", "EXT2FS", "NTFS", "?", "ccd", "jfs", "HAMMER", "HAMMER2", "UDF", "?", "EFS", "ZFS", "?", "?", "nandfs", NULL }; #define FSMAXTYPES (sizeof(fstypenames) / sizeof(fstypenames[0]) - 1) #endif - -/* - * flags shared by various drives: - */ -#define D_REMOVABLE 0x01 /* removable media */ -#define D_ECC 0x02 /* supports ECC */ -#define D_BADSECT 0x04 /* supports bad sector forw. */ -#define D_RAMDISK 0x08 /* disk emulator */ -#define D_CHAIN 0x10 /* can do back-back transfers */ /* * NB: defines ioctls from 'd'/128 and up. */ /* * Functions for proper encoding/decoding of struct disklabel into/from * bytestring. */ void bsd_partition_le_dec(u_char *ptr, struct partition *d); int bsd_disklabel_le_dec(u_char *ptr, struct disklabel *d, int maxpart); void bsd_partition_le_enc(u_char *ptr, struct partition *d); void bsd_disklabel_le_enc(u_char *ptr, struct disklabel *d); #ifndef _KERNEL __BEGIN_DECLS struct disklabel *getdiskbyname(const char *); __END_DECLS #endif #endif /* !_SYS_DISKLABEL_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/disklabel.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/diskmbr.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/diskmbr.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/sys/diskmbr.h (revision 306832) @@ -1,91 +1,44 @@ /*- * Copyright (c) 1987, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)disklabel.h 8.2 (Berkeley) 7/10/94 * $FreeBSD$ */ #ifndef _SYS_DISKMBR_H_ #define _SYS_DISKMBR_H_ +#include #include -#define DOSBBSECTOR 0 /* DOS boot block relative sector number */ -#define DOSDSNOFF 440 /* WinNT/2K/XP Drive Serial Number offset */ -#define DOSPARTOFF 446 -#define DOSPARTSIZE 16 -#define NDOSPART 4 -#define NEXTDOSPART 32 -#define DOSMAGICOFFSET 510 -#define DOSMAGIC 0xAA55 - -#define DOSPTYP_EXT 0x05 /* DOS extended partition */ -#define DOSPTYP_FAT16 0x06 /* FAT16 partition */ -#define DOSPTYP_NTFS 0x07 /* NTFS partition */ -#define DOSPTYP_FAT32 0x0b /* FAT32 partition */ -#define DOSPTYP_EXTLBA 0x0f /* DOS extended partition */ -#define DOSPTYP_PPCBOOT 0x41 /* PReP/CHRP boot partition */ -#define DOSPTYP_LDM 0x42 /* Win2k dynamic extended partition */ -#define DOSPTYP_LINSWP 0x82 /* Linux swap partition */ -#define DOSPTYP_LINUX 0x83 /* Linux partition */ -#define DOSPTYP_LINLVM 0x8e /* Linux LVM partition */ -#define DOSPTYP_386BSD 0xa5 /* 386BSD partition type */ -#define DOSPTYP_APPLE_UFS 0xa8 /* Apple Mac OS X boot */ -#define DOSPTYP_APPLE_BOOT 0xab /* Apple Mac OS X UFS */ -#define DOSPTYP_HFS 0xaf /* HFS/HFS+ partition type */ -#define DOSPTYP_PMBR 0xee /* GPT Protective MBR */ -#define DOSPTYP_EFI 0xef /* EFI FAT parition */ -#define DOSPTYP_VMFS 0xfb /* VMware VMFS partition */ -#define DOSPTYP_VMKDIAG 0xfc /* VMware vmkDiagnostic partition */ -#define DOSPTYP_LINRAID 0xfd /* Linux raid partition */ - -struct dos_partition { - unsigned char dp_flag; /* bootstrap flags */ - unsigned char dp_shd; /* starting head */ - unsigned char dp_ssect; /* starting sector */ - unsigned char dp_scyl; /* starting cylinder */ - unsigned char dp_typ; /* partition type */ - unsigned char dp_ehd; /* end head */ - unsigned char dp_esect; /* end sector */ - unsigned char dp_ecyl; /* end cylinder */ - u_int32_t dp_start; /* absolute starting sector number */ - u_int32_t dp_size; /* partition size in sectors */ -}; -#ifdef CTASSERT -CTASSERT(sizeof (struct dos_partition) == DOSPARTSIZE); -#endif - void dos_partition_dec(void const *pp, struct dos_partition *d); void dos_partition_enc(void *pp, struct dos_partition *d); - -#define DPSECT(s) ((s) & 0x3f) /* isolate relevant bits of sector */ -#define DPCYL(c, s) ((c) + (((s) & 0xc0)<<2)) /* and those that are cylinder */ #define DIOCSMBR _IOW('M', 129, u_char[512]) #endif /* !_SYS_DISKMBR_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/diskmbr.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/diskpc98.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/diskpc98.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/sys/diskpc98.h (revision 306832) @@ -1,82 +1,47 @@ /*- * Copyright (c) 1987, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)disklabel.h 8.2 (Berkeley) 7/10/94 * $FreeBSD$ */ #ifndef _SYS_DISKPC98_H_ #define _SYS_DISKPC98_H_ +#include #include -#define PC98_BBSECTOR 1 /* DOS boot block relative sector number */ -#define PC98_PARTOFF 0 -#define PC98_PARTSIZE 32 -#define PC98_NPARTS 16 -#define PC98_MAGICOFS 510 -#define PC98_MAGIC 0xAA55 - -#define PC98_MID_BOOTABLE 0x80 -#define PC98_MID_MASK 0x7f -#define PC98_MID_386BSD 0x14 - -#define PC98_SID_ACTIVE 0x80 -#define PC98_SID_MASK 0x7f -#define PC98_SID_386BSD 0x44 - -#define DOSMID_386BSD (PC98_MID_386BSD | PC98_MID_BOOTABLE) -#define DOSSID_386BSD (PC98_SID_386BSD | PC98_SID_ACTIVE) -#define PC98_PTYP_386BSD (DOSSID_386BSD << 8 | DOSMID_386BSD) - -struct pc98_partition { - unsigned char dp_mid; - unsigned char dp_sid; - unsigned char dp_dum1; - unsigned char dp_dum2; - unsigned char dp_ipl_sct; - unsigned char dp_ipl_head; - unsigned short dp_ipl_cyl; - unsigned char dp_ssect; /* starting sector */ - unsigned char dp_shd; /* starting head */ - unsigned short dp_scyl; /* starting cylinder */ - unsigned char dp_esect; /* end sector */ - unsigned char dp_ehd; /* end head */ - unsigned short dp_ecyl; /* end cylinder */ - unsigned char dp_name[16]; -}; -#ifdef CTASSERT -CTASSERT(sizeof (struct pc98_partition) == PC98_PARTSIZE); -#endif +#define DOSMID_386BSD __DOSMID_386BSD +#define DOSSID_386BSD __DOSSID_386BSD void pc98_partition_dec(void const *pp, struct pc98_partition *d); void pc98_partition_enc(void *pp, struct pc98_partition *d); #define DIOCSPC98 _IOW('M', 129, u_char[8192]) #endif /* !_SYS_DISKPC98_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/diskpc98.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/gpt.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/gpt.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/sys/gpt.h (revision 306832) @@ -1,210 +1,37 @@ /*- * Copyright (c) 2002 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_GPT_H_ #define _SYS_GPT_H_ #include -struct gpt_hdr { - char hdr_sig[8]; -#define GPT_HDR_SIG "EFI PART" - uint32_t hdr_revision; -#define GPT_HDR_REVISION 0x00010000 - uint32_t hdr_size; - uint32_t hdr_crc_self; - uint32_t __reserved; - uint64_t hdr_lba_self; - uint64_t hdr_lba_alt; - uint64_t hdr_lba_start; - uint64_t hdr_lba_end; - struct uuid hdr_uuid; - uint64_t hdr_lba_table; - uint32_t hdr_entries; - uint32_t hdr_entsz; - uint32_t hdr_crc_table; - /* - * The header as defined in the EFI spec is not a multiple of 8 bytes - * and given that the alignment requirement is on an 8 byte boundary, - * padding will happen. We make the padding explicit so that we can - * correct the value returned by sizeof() when we put the size of the - * header in field hdr_size, or otherwise use offsetof(). - */ - uint32_t padding; -}; - -struct gpt_ent { - struct uuid ent_type; - struct uuid ent_uuid; - uint64_t ent_lba_start; - uint64_t ent_lba_end; - uint64_t ent_attr; -#define GPT_ENT_ATTR_PLATFORM (1ULL << 0) -#define GPT_ENT_ATTR_BOOTME (1ULL << 59) -#define GPT_ENT_ATTR_BOOTONCE (1ULL << 58) -#define GPT_ENT_ATTR_BOOTFAILED (1ULL << 57) - uint16_t ent_name[36]; /* UTF-16. */ -}; - -#define GPT_ENT_TYPE_UNUSED \ - {0x00000000,0x0000,0x0000,0x00,0x00,{0x00,0x00,0x00,0x00,0x00,0x00}} -#define GPT_ENT_TYPE_EFI \ - {0xc12a7328,0xf81f,0x11d2,0xba,0x4b,{0x00,0xa0,0xc9,0x3e,0xc9,0x3b}} -#define GPT_ENT_TYPE_MBR \ - {0x024dee41,0x33e7,0x11d3,0x9d,0x69,{0x00,0x08,0xc7,0x81,0xf3,0x9f}} -#define GPT_ENT_TYPE_FREEBSD \ - {0x516e7cb4,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} -#define GPT_ENT_TYPE_FREEBSD_BOOT \ - {0x83bd6b9d,0x7f41,0x11dc,0xbe,0x0b,{0x00,0x15,0x60,0xb8,0x4f,0x0f}} -#define GPT_ENT_TYPE_FREEBSD_NANDFS \ - {0x74ba7dd9,0xa689,0x11e1,0xbd,0x04,{0x00,0xe0,0x81,0x28,0x6a,0xcf}} -#define GPT_ENT_TYPE_FREEBSD_SWAP \ - {0x516e7cb5,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} -#define GPT_ENT_TYPE_FREEBSD_UFS \ - {0x516e7cb6,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} -#define GPT_ENT_TYPE_FREEBSD_VINUM \ - {0x516e7cb8,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} -#define GPT_ENT_TYPE_FREEBSD_ZFS \ - {0x516e7cba,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} -#define GPT_ENT_TYPE_PREP_BOOT \ - {0x9e1a2d38,0xc612,0x4316,0xaa,0x26,{0x8b,0x49,0x52,0x1e,0x5a,0x8b}} - -/* - * The following are unused but documented here to avoid reuse. - * - * GPT_ENT_TYPE_FREEBSD_UFS2 \ - * {0x516e7cb7,0x6ecf,0x11d6,0x8f,0xf8,{0x00,0x02,0x2d,0x09,0x71,0x2b}} - */ - -/* - * Foreign partition types that we're likely to encounter. Note that Linux - * apparently choose to share data partitions with MS. I don't what the - * advantage might be. I can see how sharing swap partitions is advantageous - * though. - */ -#define GPT_ENT_TYPE_MS_BASIC_DATA \ - {0xebd0a0a2,0xb9e5,0x4433,0x87,0xc0,{0x68,0xb6,0xb7,0x26,0x99,0xc7}} -#define GPT_ENT_TYPE_MS_LDM_DATA \ - {0xaf9b60a0,0x1431,0x4f62,0xbc,0x68,{0x33,0x11,0x71,0x4a,0x69,0xad}} -#define GPT_ENT_TYPE_MS_LDM_METADATA \ - {0x5808c8aa,0x7e8f,0x42e0,0x85,0xd2,{0xe1,0xe9,0x04,0x34,0xcf,0xb3}} -#define GPT_ENT_TYPE_MS_RECOVERY \ - {0xde94bba4,0x06d1,0x4d40,0xa1,0x6a,{0xbf,0xd5,0x01,0x79,0xd6,0xac}} -#define GPT_ENT_TYPE_MS_RESERVED \ - {0xe3c9e316,0x0b5c,0x4db8,0x81,0x7d,{0xf9,0x2d,0xf0,0x02,0x15,0xae}} -#define GPT_ENT_TYPE_MS_SPACES \ - {0xe75caf8f,0xf680,0x4cee,0xaf,0xa3,{0xb0,0x01,0xe5,0x6e,0xfc,0x2d}} - -#define GPT_ENT_TYPE_LINUX_DATA \ - {0x0fc63daf,0x8483,0x4772,0x8e,0x79,{0x3d,0x69,0xd8,0x47,0x7d,0xe4}} -#define GPT_ENT_TYPE_LINUX_RAID \ - {0xa19d880f,0x05fc,0x4d3b,0xa0,0x06,{0x74,0x3f,0x0f,0x84,0x91,0x1e}} -#define GPT_ENT_TYPE_LINUX_SWAP \ - {0x0657fd6d,0xa4ab,0x43c4,0x84,0xe5,{0x09,0x33,0xc8,0x4b,0x4f,0x4f}} -#define GPT_ENT_TYPE_LINUX_LVM \ - {0xe6d6d379,0xf507,0x44c2,0xa2,0x3c,{0x23,0x8f,0x2a,0x3d,0xf9,0x28}} - -#define GPT_ENT_TYPE_VMFS \ - {0xaa31e02a,0x400f,0x11db,0x95,0x90,{0x00,0x0c,0x29,0x11,0xd1,0xb8}} -#define GPT_ENT_TYPE_VMKDIAG \ - {0x9d275380,0x40ad,0x11db,0xbf,0x97,{0x00,0x0c,0x29,0x11,0xd1,0xb8}} -#define GPT_ENT_TYPE_VMRESERVED \ - {0x9198effc,0x31c0,0x11db,0x8f,0x78,{0x00,0x0c,0x29,0x11,0xd1,0xb8}} -#define GPT_ENT_TYPE_VMVSANHDR \ - {0x381cfccc,0x7288,0x11e0,0x92,0xee,{0x00,0x0c,0x29,0x11,0xd0,0xb2}} - -#define GPT_ENT_TYPE_APPLE_BOOT \ - {0x426F6F74,0x0000,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} -#define GPT_ENT_TYPE_APPLE_HFS \ - {0x48465300,0x0000,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} -#define GPT_ENT_TYPE_APPLE_UFS \ - {0x55465300,0x0000,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} -#define GPT_ENT_TYPE_APPLE_ZFS \ - {0x6a898cc3,0x1dd2,0x11b2,0x99,0xa6,{0x08,0x00,0x20,0x73,0x66,0x31}} -#define GPT_ENT_TYPE_APPLE_RAID \ - {0x52414944,0x0000,0x11aa,0xaa,0x22,{0x00,0x30,0x65,0x43,0xec,0xac}} -#define GPT_ENT_TYPE_APPLE_RAID_OFFLINE \ - {0x52414944,0x5f4f,0x11aa,0xaa,0x22,{0x00,0x30,0x65,0x43,0xec,0xac}} -#define GPT_ENT_TYPE_APPLE_LABEL \ - {0x4C616265,0x6c00,0x11aa,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} -#define GPT_ENT_TYPE_APPLE_TV_RECOVERY \ - {0x5265636f,0x7665,0x11AA,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} -#define GPT_ENT_TYPE_APPLE_CORE_STORAGE \ - {0x53746f72,0x6167,0x11AA,0xaa,0x11,{0x00,0x30,0x65,0x43,0xec,0xac}} - -#define GPT_ENT_TYPE_NETBSD_FFS \ - {0x49f48d5a,0xb10e,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} -#define GPT_ENT_TYPE_NETBSD_LFS \ - {0x49f48d82,0xb10e,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} -#define GPT_ENT_TYPE_NETBSD_SWAP \ - {0x49f48d32,0xb10e,0x11dc,0xB9,0x9B,{0x00,0x19,0xd1,0x87,0x96,0x48}} -#define GPT_ENT_TYPE_NETBSD_RAID \ - {0x49f48daa,0xb10e,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} -#define GPT_ENT_TYPE_NETBSD_CCD \ - {0x2db519c4,0xb10f,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} -#define GPT_ENT_TYPE_NETBSD_CGD \ - {0x2db519ec,0xb10f,0x11dc,0xb9,0x9b,{0x00,0x19,0xd1,0x87,0x96,0x48}} - -#define GPT_ENT_TYPE_DRAGONFLY_LABEL32 \ - {0x9d087404,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_SWAP \ - {0x9d58fdbd,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_UFS1 \ - {0x9d94ce7c,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_VINUM \ - {0x9dd4478f,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_CCD \ - {0xdbd5211b,0x1ca5,0x11dc,0x88,0x17,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_LABEL64 \ - {0x3d48ce54,0x1d16,0x11dc,0x86,0x96,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_LEGACY \ - {0xbd215ab2,0x1d16,0x11dc,0x86,0x96,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_HAMMER \ - {0x61dc63ac,0x6e38,0x11dc,0x85,0x13,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} -#define GPT_ENT_TYPE_DRAGONFLY_HAMMER2 \ - {0x5cbb9ad1,0x862d,0x11dc,0xa9,0x4d,{0x01,0x30,0x1b,0xb8,0xa9,0xf5}} - -#define GPT_ENT_TYPE_CHROMEOS_FIRMWARE \ - {0xcab6e88e,0xabf3,0x4102,0xa0,0x7a,{0xd4,0xbb,0x9b,0xe3,0xc1,0xd3}} -#define GPT_ENT_TYPE_CHROMEOS_KERNEL \ - {0xfe3a2a5d,0x4f32,0x41a7,0xb7,0x25,{0xac,0xcc,0x32,0x85,0xa3,0x09}} -#define GPT_ENT_TYPE_CHROMEOS_RESERVED \ - {0x2e0a753d,0x9e48,0x43b0,0x83,0x37,{0xb1,0x51,0x92,0xcb,0x1b,0x5e}} -#define GPT_ENT_TYPE_CHROMEOS_ROOT \ - {0x3cb8e202,0x3b7e,0x47dd,0x8a,0x3c,{0x7f,0xf2,0xa1,0x3c,0xfc,0xec}} - -#define GPT_ENT_TYPE_OPENBSD_DATA \ - {0x824cc7a0,0x36a8,0x11e3,0x89,0x0a,{0x95,0x25,0x19,0xad,0x3f,0x61}} - -/* - * Boot partition used by GRUB 2. - */ -#define GPT_ENT_TYPE_BIOS_BOOT \ - {0x21686148,0x6449,0x6e6f,0x74,0x4e,{0x65,0x65,0x64,0x45,0x46,0x49}} +#define GPT_UUID_TYPE struct uuid +#include #endif /* _SYS_GPT_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/sys/gpt.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/sys/vnode.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/vnode.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/sys/vnode.h (revision 306832) @@ -1,884 +1,884 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. * See #defines below for renaming to v_* namespace. */ union { struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ struct socket *vu_socket; /* v unix domain net (VSOCK) */ struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ } v_un; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_actfreelist; /* l vnode active/free lists */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_int v_iflag; /* i vnode flags (see below) */ u_int v_vflag; /* v vnode flags */ u_int v_mflag; /* l mnt-specific vnode flags */ int v_writecount; /* v ref count of writers */ u_int v_hash; enum vtype v_type; /* u vnode type */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define v_mountedhere v_un.vu_mount #define v_socket v_un.vu_socket #define v_rdev v_un.vu_cdev #define v_fifoinfo v_un.vu_fifoinfo #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* socket, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VI_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_ACTIVE 0x0200 /* This vnode is on the active list */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VMP_TMPMNTFREELIST 0x0001 /* Vnode is on mnt's tmp free list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* filesystem id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. */ void cache_changesize(int newhashsize); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_purge(struct vnode *vp); void cache_purge_negative(struct vnode *vp); -void cache_purgevfs(struct mount *mp); +void cache_purgevfs(struct mount *mp, bool force); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(u_int count); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen); int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); #define vdrop(vp) _vdrop((vp), 0) #define vdropl(vp) _vdrop((vp), 1) void _vdrop(struct vnode *, bool); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); #define vhold(vp) _vhold((vp), 0) #define vholdl(vp) _vhold((vp), 1) void _vhold(struct vnode *, bool); void vinactive(struct vnode *, struct thread *); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ void vop_close_post(void *a, int rc); void vop_create_post(void *a, int rc); void vop_deleteextattr_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_setextattr_post(void *a, int rc); void vop_symlink_post(void *a, int rc); #ifdef DEBUG_VFS_LOCKS void vop_strategy_pre(void *a); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #else #define vop_strategy_pre(x) do { } while (0) #define vop_lock_pre(x) do { } while (0) #define vop_lock_post(x, y) do { } while (0) #define vop_unlock_post(x, y) do { } while (0) #define vop_unlock_pre(x) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefl(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(int newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct ucred *cred); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: user/alc/PQ_LAUNDRY/sys/sys/vtoc.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/vtoc.h (revision 306831) +++ user/alc/PQ_LAUNDRY/sys/sys/vtoc.h (revision 306832) @@ -1,108 +1,5 @@ /*- - * Copyright (c) 2008 Marcel Moolenaar - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ + * This file is in the public domain. */ - -#ifndef _SYS_VTOC_H_ -#define _SYS_VTOC_H_ - -#define VTOC_TAG_UNASSIGNED 0x00 -#define VTOC_TAG_BOOT 0x01 -#define VTOC_TAG_ROOT 0x02 -#define VTOC_TAG_SWAP 0x03 -#define VTOC_TAG_USR 0x04 -#define VTOC_TAG_BACKUP 0x05 /* "c" partition */ -#define VTOC_TAG_STAND 0x06 -#define VTOC_TAG_VAR 0x07 -#define VTOC_TAG_HOME 0x08 -#define VTOC_TAG_ALTSCTR 0x09 /* alternate sector partition */ -#define VTOC_TAG_CACHE 0x0a /* Solaris cachefs partition */ -#define VTOC_TAG_VXVM_PUB 0x0e /* VxVM public region */ -#define VTOC_TAG_VXVM_PRIV 0x0f /* VxVM private region */ - -/* NetBSD/mips defines this */ -#define VTOC_TAG_NETBSD_FFS 0xff - -/* FreeBSD tags: the high byte equals ELFOSABI_FREEBSD */ -#define VTOC_TAG_FREEBSD_SWAP 0x0901 -#define VTOC_TAG_FREEBSD_UFS 0x0902 -#define VTOC_TAG_FREEBSD_VINUM 0x0903 -#define VTOC_TAG_FREEBSD_ZFS 0x0904 -#define VTOC_TAG_FREEBSD_NANDFS 0x0905 - -#define VTOC_FLAG_UNMNT 0x01 /* unmountable partition */ -#define VTOC_FLAG_RDONLY 0x10 /* partition is read/only */ - -#define VTOC_ASCII_LEN 128 -#define VTOC_BOOTSIZE 8192 /* 16 sectors */ -#define VTOC_MAGIC 0xdabe -#define VTOC_RAW_PART 2 -#define VTOC_SANITY 0x600ddeee -#define VTOC_VERSION 1 -#define VTOC_VOLUME_LEN 8 - -#define VTOC8_NPARTS 8 - -struct vtoc8 { - char ascii[VTOC_ASCII_LEN]; - uint32_t version; - char volume[VTOC_VOLUME_LEN]; - uint16_t nparts; - struct { - uint16_t tag; - uint16_t flag; - } part[VTOC8_NPARTS] __packed; - uint16_t __alignment; - uint32_t bootinfo[3]; - uint32_t sanity; - uint32_t reserved[10]; - uint32_t timestamp[VTOC8_NPARTS]; - uint16_t wskip; - uint16_t rskip; - char padding[152]; - uint16_t rpm; - uint16_t physcyls; - uint16_t sparesecs; - uint16_t spare1[2]; - uint16_t interleave; - uint16_t ncyls; - uint16_t altcyls; - uint16_t nheads; - uint16_t nsecs; - uint16_t spare2[2]; - struct { - uint32_t cyl; - uint32_t nblks; - } map[VTOC8_NPARTS]; - uint16_t magic; - uint16_t cksum; -} __packed; - -#ifdef CTASSERT -CTASSERT(sizeof(struct vtoc8) == 512); -#endif - -#endif /* _SYS_VTOC_H_ */ +/* $FreeBSD$ */ +#include Property changes on: user/alc/PQ_LAUNDRY/sys/sys/vtoc.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/tools/tools/cxgbetool/cxgbetool.8 =================================================================== --- user/alc/PQ_LAUNDRY/tools/tools/cxgbetool/cxgbetool.8 (revision 306831) +++ user/alc/PQ_LAUNDRY/tools/tools/cxgbetool/cxgbetool.8 (revision 306832) @@ -1,490 +1,503 @@ .\" Copyright (c) 2015, Chelsio Inc .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions are met: .\" .\" 1. Redistributions of source code must retain the above copyright notice, .\" this list of conditions and the following disclaimer. .\" .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" 3. Neither the name of the Chelsio Inc nor the names of its .\" contributors may be used to endorse or promote products derived from .\" this software without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" .\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" .\" * Other names and brands may be claimed as the property of others. .\" .\" $FreeBSD$ .\" .Dd February 1, 2013 .Dt CXGBETOOL 8 .Os .Sh NAME .Nm cxgbetool .Nd Userspace companion to .Xr cxgbe 4 .Sh SYNOPSIS .Bl -item -compact .It .Nm Ar nexus command Op Ar parameter ... .Pp .It .Nm Ar nexus Cm clearstats Ar port_id .It .Nm Ar nexus Cm context Bro Cm ingress | egress | fl | cong Brc Ar cntxt_id .It .Nm Ar nexus Cm filter mode Op Ar match-criteria ... .It .Nm Ar nexus Cm filter Ar idx Bro Ar filter-specification | Cm delete Brc .It .Nm Ar nexus Cm filter list .It .Nm Ar nexus Cm i2c Ar port_id devaddr addr Op Ar len .It +.Nm Ar nexus Cm loadcfg Ar fw-config.txt +.It +.Nm Ar nexus Cm loadcfg clear +.It .Nm Ar nexus Cm loadfw Ar fw-image.bin .It .Nm Ar nexus Cm memdump Ar addr len .It .Nm Ar nexus Bro Cm reg | reg64 Brc Ar addr Ns Op Ar =val .It .Nm Ar nexus Cm regdump Op Ar register-block ... .It .Nm Ar nexus Cm sched-class Ar sub-command Op Ar param Ar value .It .Nm Ar nexus Cm sched-queue Ar port Ar queue Ar class .It .Nm Ar nexus Cm stdio .It .Nm Ar nexus Cm tcb Ar tid .El .Sh DESCRIPTION .Nm provides command-line access to features and debug facilities exported by .Xr cxgbe 4 via private ioctls. The target nexus device, .Va t4nex%d , is always the first argument. (The parent nexus for an Ethernet port .Va cxgbe%d is listed in .Va dev.cxgbe.%d.%parent in the .Xr sysctl 8 MIB). The rest consists of a command and any parameters required by that command. .Ss Commands .Bl -ohang .It Cm clearstats Ar port_id Clear all transmit, receive, and error statistics of all queues associated with a port. The total number of ports attached to a nexus is listed in .Va dev.t4nex.%d.nports and the 0 based .Ar port_id identifies a port within this range. .Pp .Bl -item -compact .It .Cm context ingress Ar ingress_cntxt_id .It .Cm context cong Ar ingress_cntxt_id .It .Cm context egress Ar egress_cntxt_id .It .Cm context fl Ar flm_cntxt_id .El Display hardware context for an ingress queue, congestion manager, egress queue, or freelist manager. .Bl -tag -width ingress_cntxt_id -compact .It Ar ingress_cntxt_id context id of an ingress queue -- the value listed in one of .Va dev.t4nex.%d.fwq.cntxt_id Ns , .Va dev.cxgbe.%d.rxq.%d.cntxt_id Ns , or .Va dev.cxgbe.%d.ofld_rxq.%d.cntxt_id Ns . .It Ar egress_cntxt_id context id of an egress queue -- the value listed in one of .Va dev.t4nex.%d.mgmtq.cntxt_id Ns , .Va dev.cxgbe.%d.txq.%d.cntxt_id Ns , .Va dev.cxgbe.%d.ctrlq.%d.cntxt_id Ns , .Va dev.cxgbe.%d.ofld_txq.%d.cntxt_id Ns , .Va dev.cxgbe.%d.rxq.%d.fl.cntxt_id Ns , or .Va dev.cxgbe.%d.ofld_rxq.%d.fl.cntxt_id Ns . Note that freelists are egress queues too. .It Ar flm_cntxt_id context id of a freelist manager. The FLM context id is displayed in the egress context dump of a freelist as FLMcontextID. .El .It Cm filter mode Op Ar match-criteria ... Display or set the nexus's filter mode. .Ar match-criteria is a whitespace separated list of criteria from the table below. Each criteria has an associated budget which is also listed in the table. The total budget allowed is 36 and attempts to set a filter mode that exceeds this will be rejected. Every filter must conform to the filter mode -- multiple match criteria per filter are allowed but only from among those in the current setting of the filter mode. The filter mode can only be changed when there are no existing filters. Its default value is .Cm ipv4 ipv6 sip dip sport dport matchtype proto vlan iport fcoe .Pp (Note that .Ar mask defaults to all 1s when not provided explicitly. Also note that many of the items being matched are discrete numeric values rather than bit fields and should be masked with caution.) .TS center expand; cb cb cb cbw(40m) cb c l l. Criteria Budget Usage Matches if ... _ ipv4 0 T{ .Cm type ipv4 T} T{ incoming packet is an IPv4 datagram. T} _ ipv6 0 T{ .Cm type ipv6 T} T{ incoming packet is an IPv6 datagram. T} _ sip 0 T{ .Cm sip Ar addr Ns Op / Ns Ar mask T} T{ bitwise and of the source address in an incoming IP datagram with .Ar mask equals .Ar addr Ns . .Ar addr can be an IPv4 or IPv6 address. T} _ dip 0 T{ .Cm dip Ar addr Ns Op / Ns Ar mask T} T{ bitwise and of the destination address in an incoming IP datagram with .Ar mask equals .Ar addr Ns . .Ar addr can be an IPv4 or IPv6 address. T} _ sport 0 T{ .Cm sport Ar port Ns Op : Ns Ar mask T} T{ bitwise and of the source port in an incoming TCP or UDP datagram with .Ar mask equals .Ar port Ns . T} _ dport 0 T{ .Cm dport Ar port Ns Op : Ns Ar mask T} T{ bitwise and of the destination port in an incoming TCP or UDP datagram with .Ar mask equals .Ar port Ns . T} _ fcoe 1 T{ .Cm fcoe Brq 0 | 1 T} T{ incoming frame is Fibre Channel over Ethernet(1) or not(0). T} _ iport 3 T{ .Cm iport Ar val Ns Op : Ns Ar mask T} T{ bitwise and of the ingress port with .Ar mask equals .Ar val Ns . The ingress port is a 3 bit number that identifies the port on which a frame arrived. Physical ports are numbered 0-3 and 4-7 are internal loopback paths within the chip. Note that ingress port is not a bit field so it is not always possible to match an arbitrary subset of ingress ports with a single filter rule. T} _ ovlan 17 T{ .Cm ovlan Ar tag Ns Op : Ns Ar mask T} T{ bitwise and of the 16-bit outer VLAN tag of an incoming frame with .Ar mask equals .Ar tag Ns . T} _ vlan 17 T{ .Cm vlan Ar tag Ns Op : Ns Ar mask T} T{ bitwise and of the 16-bit VLAN tag of an incoming QinQ frame with .Ar mask equals .Ar tag Ns . The inner VLAN tag is used if the incoming frame is QinQ. T} _ tos 8 T{ .Cm tos Ar val Ns Op : Ns Ar mask T} T{ bitwise and of the 8-bit IP Type of Service/IPv6 Traffic Class in an incoming packet with .Ar mask equals .Ar val Ns . T} _ proto 8 T{ .Cm proto Ar ipproto Ns Op : Ns Ar mask T} T{ bitwise and of the 8-bit IP protocol in an incoming packet with .Ar mask equals .Ar ipproto Ns . T} _ ethtype 16 T{ .Cm ethtype Ar type Ns Op : Ns Ar mask T} T{ bitwise and of the 16-bit Ethernet type field of an incoming frame with .Ar mask equals .Ar type Ns . T} _ macidx 9 T{ .Cm macidx Ar idx Ns Op : Ns Ar mask T} T{ bitwise and of the MAC Address Match Index of an incoming frame with .Ar mask equals .Ar idx Ns . The MAC Address Match Index refers to an entry in the MPS TCAM or in the MPS hash. See .Cm matchtype for more information. T} _ matchtype 3 T{ .Cm matchtype Ar type Ns Op : Ns Ar mask T} T{ bitwise and of the Match Type of an incoming frame with .Ar mask equals .Ar idx Ns . Match Type is one of the following: .Bl -tag -width "n" -compact .It 0 destination MAC in incoming frame is a unicast L2 address that is programmed in the MPS TCAM. .Cm macidx can be used to match the index (and thus the MAC address) of the match in the TCAM. .It 1 destination MAC in incoming frame is a unicast L2 address that "hit" a hash entry in the MPS hash table. .Cm macidx can be used to match the index of the entry in the MPS hash table. .It 2 destination MAC in incoming frame is a multicast L2 address that is programmed in the MPS TCAM. .Cm macidx can be used to match the index (and thus the MAC address) of the match in the TCAM. .It 3 destination MAC in incoming frame is a multicast L2 address that "hit" an entry in the MPS hash table. .It 4 interface on which incoming frame was received is in promiscuous mode and the destination MAC in the frame is not a broadcast address, and does not match in the MPS TCAM or the MPS hash either. (The frame would have been discarded if the interface wasn't in promiscuous mode.) .It 5 interface on which incoming frame was received is in promiscuous mode and the destination MAC in the frame is not a broadcast address; it wasn't looked up in the MPS TCAM or the MPS hash because the chip is configured to give precedence to promiscuous mode classification. .It 6 destination MAC in incoming frame is a broadcast address. .It 7 Not documented. Do not use. .El T} _ frag 1 T{ .Cm frag Brq 0 | 1 T} T{ incoming frame is part of a fragmented IP datagram(1) or not(0). T} .TE .It Cm filter Ar idx Ar filter-specification Program a filter at the index specified by .Ar idx Ns . .Ar filter-specification consists of one or more matches to try against an incoming frame and an action to perform when all matches succeed. .It Cm filter Ar idx Cm delete Delete filter that is at the given index. .It Cm filter Cm list List all filters programmed into the hardware. .It Cm i2c Ar port_id devaddr addr Op Ar len +.It Cm loadcfg Ar fw-config.txt +Install the firmware configuration file contained in +.Ar fw-config.txt +to the card. +Set hw.cxgbe.config_file="flash" in loader.conf to get +.Xr cxgbe 4 +to use the on-flash configuration. +.It Cm loadcfg Cm clear +Erase configuration file from the card. .It Cm loadfw Ar fw-image.bin Install the firmware contained in .Ar fw-image.bin to the card. .It Cm memdump Ar addr len Display .Ar len bytes of data of the card's memory starting at .Ar addr Ns . The card's memory map is available in .Va dev.t4nex.%d.misc.meminfo Ns . .It Bro Cm reg | reg64 Brc Ar addr Ns Op Ar =val .It Cm regdump Op Ar register-block ... Display contents of device registers. One or more .Ar register-block can be specified to limit the registers displayed. The default is to display registers for all blocks. Registers with read side effects are not read during a .Cm regdump operation. .Ar register-block can be .Cm sge pci dbg mc ma edc0 edc1 cim tp ulp_rx ulp_tx pmrx pmtx mps cplsw .Cm smb i2c mi uart pmu sf pl le ncsi xgmac Ns . .It Cm sched-class config Op Ar param Ar value Configure optional feature capabilities for the TX scheduler. .Bl -ohang -offset indent .It Sy type Ar scheduler-type Use packet for the packet scheduler. .It Sy minmax Ar value A non-zero value will enable "minmax" mode; a zero value will disable "minmax" mode. .Pp NOTE: Many (most) of the parameters and constraints are adapter-specific - for instance the number of channels and classes which are available whether various modes are implemented, etc. Consult the adapter documentation for specific information on any limitations. .El .It Cm sched-class params Op Ar param Ar value Configure parameters for a scheduling class. .Bl -ohang -offset indent .It Sy type Ar scheduler-type Use packet for packet scheduler. .It Sy level Ar scheduler-hierarchy-level The "level" within the scheduling hierarchy which is being programed: .Pp .Bl -tag -width "cl-wrr" -compact -offset indent .It Sy cl-rl Class Rate Limiting. .Pp .It Sy cl-wrr Class Weighted Round Robin. .Pp .It Sy cl-wrr Channel Rate Limiting. .El .It Sy mode Ar scheduler-mode The mode in which the scheduling class is going to operate: .Pp .Bl -tag -width "class" -compact -offset indent .It Sy class All of the "flows" bound to the scheduling class will be held to aggregate scheduling constraints. .Pp .It Sy flow Each of the "flows" bound to the scheduling class will be held to the scheduling constraints. .El .Pp E.g. if the scheduling class has a TX bandwidth of 10Mb/s, in .Cm class mode, all of the "flows" bound to the class would be limited to an aggregate bandwidth of 10Mb/s; but in .Cm flow mode, each of the "flows" bound to the scheduling class would be limited to 10Mb/s. .It Sy rate-unit Ar scheduler-rate-unit The units of the scheduler rate constraints: .Pp .Bl -tag -width "bits" -compact -offset indent .It Sy bits bit rate in Kb/s. .Pp .It Sy pkts packets/s. .El .It Sy rate-mode Ar scheduler-rate-mode The mode of the scheduler rate constraints: .Pp .Bl -tag -width "relative" -compact -offset indent .It Sy relative percent of port rate. .Pp .It Sy absolute Kb/s. .El .It Sy channel Ar scheduler-channel-index The scheduling channel to which the scheduling class will be bound. .It Sy class Ar scheduler-class-index The scheduling class being programmed. .It Sy min-rate Ar minimum-rate The minimum guaranteed rate to which a rate-limiting scheduling class hierarchy will have access. .It Sy max-rate Ar maximum-rate The maximum rate for a rate-limiting scheduling class hierarchy. .It Sy weight Ar round-robin-weight The weight to be used for a weighted-round-robin scheduling hierarchy. .It Sy pkt-size Ar average-packet-size The average packet size will be used to compute scheduler constraints for a rate-limited scheduler class hierarchy. .Pp NOTE: Many (most) of the parameters and constraints are adapter-specific - for instance the number of channels and classes which are available, whether various modes are implemented, etc. Consult the adapter documentation for specific information on any limitations. .El .It Cm sched-queue Ar port queue class Bind the indicated port's NIC TX .Ar queue to the specified TX Scheduler .Ar class. If the TX .Ar queue is .Cm all, * or any negative value, the binding will apply to all of the TX queues associated with the .Ar interface. If the class is .Cm unbind, clear or any negative value, the TX queue(s) will be unbound from any current TX Scheduler Class binding. .It Cm stdio Switch to interactive mode. .It Cm tcb Ar tid Display contents of the hardware TCB (TCP Control Block) for the connection identfied by .Ar tid Ns . .El .Sh FILES /sys/dev/cxgbe/t4_ioctl.h .Sh AUTHORS This manual page was written by .An Navdeep Parhar Aq np@FreeBSD.org . Index: user/alc/PQ_LAUNDRY/tools/tools/cxgbetool/cxgbetool.c =================================================================== --- user/alc/PQ_LAUNDRY/tools/tools/cxgbetool/cxgbetool.c (revision 306831) +++ user/alc/PQ_LAUNDRY/tools/tools/cxgbetool/cxgbetool.c (revision 306832) @@ -1,2819 +1,2866 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "t4_ioctl.h" #define in_range(val, lo, hi) ( val < 0 || (val <= hi && val >= lo)) #define max(x, y) ((x) > (y) ? (x) : (y)) static const char *progname, *nexus; static int chip_id; /* 4 for T4, 5 for T5 */ struct reg_info { const char *name; uint32_t addr; uint32_t len; }; struct mod_regs { const char *name; const struct reg_info *ri; }; struct field_desc { const char *name; /* Field name */ unsigned short start; /* Start bit position */ unsigned short end; /* End bit position */ unsigned char shift; /* # of low order bits omitted and implicitly 0 */ unsigned char hex; /* Print field in hex instead of decimal */ unsigned char islog2; /* Field contains the base-2 log of the value */ }; #include "reg_defs_t4.c" #include "reg_defs_t5.c" #include "reg_defs_t6.c" #include "reg_defs_t4vf.c" static void usage(FILE *fp) { fprintf(fp, "Usage: %s [operation]\n", progname); fprintf(fp, "\tclearstats clear port statistics\n" "\tcontext show an SGE context\n" "\tfilter [ ] ... set a filter\n" "\tfilter delete|clear delete a filter\n" "\tfilter list list all filters\n" "\tfilter mode [] ... get/set global filter mode\n" "\ti2c [] read from i2c device\n" + "\tloadcfg install configuration file\n" + "\tloadcfg clear remove configuration file\n" "\tloadfw install firmware\n" "\tmemdump dump a memory range\n" "\tmodinfo [raw] optics/cable information\n" "\treg
[=] read/write register\n" "\treg64
[=] read/write 64 bit register\n" "\tregdump [] ... dump registers\n" "\tsched-class params .. configure TX scheduler class\n" "\tsched-queue bind NIC queues to TX Scheduling class\n" "\tstdio interactive mode\n" "\ttcb read TCB\n" "\ttracer tx|rx set and enable a tracer\n" "\ttracer disable|enable disable or enable a tracer\n" "\ttracer list list all tracers\n" ); } static inline unsigned int get_card_vers(unsigned int version) { return (version & 0x3ff); } static int real_doit(unsigned long cmd, void *data, const char *cmdstr) { static int fd = -1; int rc = 0; if (fd == -1) { char buf[64]; snprintf(buf, sizeof(buf), "/dev/%s", nexus); if ((fd = open(buf, O_RDWR)) < 0) { warn("open(%s)", nexus); rc = errno; return (rc); } chip_id = nexus[1] - '0'; } rc = ioctl(fd, cmd, data); if (rc < 0) { warn("%s", cmdstr); rc = errno; } return (rc); } #define doit(x, y) real_doit(x, y, #x) static char * str_to_number(const char *s, long *val, long long *vall) { char *p; if (vall) *vall = strtoll(s, &p, 0); else if (val) *val = strtol(s, &p, 0); else p = NULL; return (p); } static int read_reg(long addr, int size, long long *val) { struct t4_reg reg; int rc; reg.addr = (uint32_t) addr; reg.size = (uint32_t) size; reg.val = 0; rc = doit(CHELSIO_T4_GETREG, ®); *val = reg.val; return (rc); } static int write_reg(long addr, int size, long long val) { struct t4_reg reg; reg.addr = (uint32_t) addr; reg.size = (uint32_t) size; reg.val = (uint64_t) val; return doit(CHELSIO_T4_SETREG, ®); } static int register_io(int argc, const char *argv[], int size) { char *p, *v; long addr; long long val; int w = 0, rc; if (argc == 1) { /* OR = */ p = str_to_number(argv[0], &addr, NULL); if (*p) { if (*p != '=') { warnx("invalid register \"%s\"", argv[0]); return (EINVAL); } w = 1; v = p + 1; p = str_to_number(v, NULL, &val); if (*p) { warnx("invalid value \"%s\"", v); return (EINVAL); } } } else if (argc == 2) { /* */ w = 1; p = str_to_number(argv[0], &addr, NULL); if (*p) { warnx("invalid register \"%s\"", argv[0]); return (EINVAL); } p = str_to_number(argv[1], NULL, &val); if (*p) { warnx("invalid value \"%s\"", argv[1]); return (EINVAL); } } else { warnx("reg: invalid number of arguments (%d)", argc); return (EINVAL); } if (w) rc = write_reg(addr, size, val); else { rc = read_reg(addr, size, &val); if (rc == 0) printf("0x%llx [%llu]\n", val, val); } return (rc); } static inline uint32_t xtract(uint32_t val, int shift, int len) { return (val >> shift) & ((1 << len) - 1); } static int dump_block_regs(const struct reg_info *reg_array, const uint32_t *regs) { uint32_t reg_val = 0; for ( ; reg_array->name; ++reg_array) if (!reg_array->len) { reg_val = regs[reg_array->addr / 4]; printf("[%#7x] %-47s %#-10x %u\n", reg_array->addr, reg_array->name, reg_val, reg_val); } else { uint32_t v = xtract(reg_val, reg_array->addr, reg_array->len); printf(" %*u:%u %-47s %#-10x %u\n", reg_array->addr < 10 ? 3 : 2, reg_array->addr + reg_array->len - 1, reg_array->addr, reg_array->name, v, v); } return (1); } static int dump_regs_table(int argc, const char *argv[], const uint32_t *regs, const struct mod_regs *modtab, int nmodules) { int i, j, match; for (i = 0; i < argc; i++) { for (j = 0; j < nmodules; j++) { if (!strcmp(argv[i], modtab[j].name)) break; } if (j == nmodules) { warnx("invalid register block \"%s\"", argv[i]); fprintf(stderr, "\nAvailable blocks:"); for ( ; nmodules; nmodules--, modtab++) fprintf(stderr, " %s", modtab->name); fprintf(stderr, "\n"); return (EINVAL); } } for ( ; nmodules; nmodules--, modtab++) { match = argc == 0 ? 1 : 0; for (i = 0; !match && i < argc; i++) { if (!strcmp(argv[i], modtab->name)) match = 1; } if (match) dump_block_regs(modtab->ri, regs); } return (0); } #define T4_MODREGS(name) { #name, t4_##name##_regs } static int dump_regs_t4(int argc, const char *argv[], const uint32_t *regs) { static struct mod_regs t4_mod[] = { T4_MODREGS(sge), { "pci", t4_pcie_regs }, T4_MODREGS(dbg), T4_MODREGS(mc), T4_MODREGS(ma), { "edc0", t4_edc_0_regs }, { "edc1", t4_edc_1_regs }, T4_MODREGS(cim), T4_MODREGS(tp), T4_MODREGS(ulp_rx), T4_MODREGS(ulp_tx), { "pmrx", t4_pm_rx_regs }, { "pmtx", t4_pm_tx_regs }, T4_MODREGS(mps), { "cplsw", t4_cpl_switch_regs }, T4_MODREGS(smb), { "i2c", t4_i2cm_regs }, T4_MODREGS(mi), T4_MODREGS(uart), T4_MODREGS(pmu), T4_MODREGS(sf), T4_MODREGS(pl), T4_MODREGS(le), T4_MODREGS(ncsi), T4_MODREGS(xgmac) }; return dump_regs_table(argc, argv, regs, t4_mod, nitems(t4_mod)); } #undef T4_MODREGS #define T5_MODREGS(name) { #name, t5_##name##_regs } static int dump_regs_t5(int argc, const char *argv[], const uint32_t *regs) { static struct mod_regs t5_mod[] = { T5_MODREGS(sge), { "pci", t5_pcie_regs }, T5_MODREGS(dbg), { "mc0", t5_mc_0_regs }, { "mc1", t5_mc_1_regs }, T5_MODREGS(ma), { "edc0", t5_edc_t50_regs }, { "edc1", t5_edc_t51_regs }, T5_MODREGS(cim), T5_MODREGS(tp), { "ulprx", t5_ulp_rx_regs }, { "ulptx", t5_ulp_tx_regs }, { "pmrx", t5_pm_rx_regs }, { "pmtx", t5_pm_tx_regs }, T5_MODREGS(mps), { "cplsw", t5_cpl_switch_regs }, T5_MODREGS(smb), { "i2c", t5_i2cm_regs }, T5_MODREGS(mi), T5_MODREGS(uart), T5_MODREGS(pmu), T5_MODREGS(sf), T5_MODREGS(pl), T5_MODREGS(le), T5_MODREGS(ncsi), T5_MODREGS(mac), { "hma", t5_hma_t5_regs } }; return dump_regs_table(argc, argv, regs, t5_mod, nitems(t5_mod)); } #undef T5_MODREGS #define T6_MODREGS(name) { #name, t6_##name##_regs } static int dump_regs_t6(int argc, const char *argv[], const uint32_t *regs) { static struct mod_regs t6_mod[] = { T6_MODREGS(sge), { "pci", t6_pcie_regs }, T6_MODREGS(dbg), { "mc0", t6_mc_0_regs }, T6_MODREGS(ma), { "edc0", t6_edc_t60_regs }, { "edc1", t6_edc_t61_regs }, T6_MODREGS(cim), T6_MODREGS(tp), { "ulprx", t6_ulp_rx_regs }, { "ulptx", t6_ulp_tx_regs }, { "pmrx", t6_pm_rx_regs }, { "pmtx", t6_pm_tx_regs }, T6_MODREGS(mps), { "cplsw", t6_cpl_switch_regs }, T6_MODREGS(smb), { "i2c", t6_i2cm_regs }, T6_MODREGS(mi), T6_MODREGS(uart), T6_MODREGS(pmu), T6_MODREGS(sf), T6_MODREGS(pl), T6_MODREGS(le), T6_MODREGS(ncsi), T6_MODREGS(mac), { "hma", t6_hma_t6_regs } }; return dump_regs_table(argc, argv, regs, t6_mod, nitems(t6_mod)); } #undef T6_MODREGS static int dump_regs_t4vf(int argc, const char *argv[], const uint32_t *regs) { static struct mod_regs t4vf_mod[] = { { "sge", t4vf_sge_regs }, { "mps", t4vf_mps_regs }, { "pl", t4vf_pl_regs }, { "mbdata", t4vf_mbdata_regs }, { "cim", t4vf_cim_regs }, }; return dump_regs_table(argc, argv, regs, t4vf_mod, nitems(t4vf_mod)); } static int dump_regs_t5vf(int argc, const char *argv[], const uint32_t *regs) { static struct mod_regs t5vf_mod[] = { { "sge", t5vf_sge_regs }, { "mps", t4vf_mps_regs }, { "pl", t5vf_pl_regs }, { "mbdata", t4vf_mbdata_regs }, { "cim", t4vf_cim_regs }, }; return dump_regs_table(argc, argv, regs, t5vf_mod, nitems(t5vf_mod)); } static int dump_regs_t6vf(int argc, const char *argv[], const uint32_t *regs) { static struct mod_regs t6vf_mod[] = { { "sge", t5vf_sge_regs }, { "mps", t4vf_mps_regs }, { "pl", t6vf_pl_regs }, { "mbdata", t4vf_mbdata_regs }, { "cim", t4vf_cim_regs }, }; return dump_regs_table(argc, argv, regs, t6vf_mod, nitems(t6vf_mod)); } static int dump_regs(int argc, const char *argv[]) { int vers, revision, rc; struct t4_regdump regs; uint32_t len; len = max(T4_REGDUMP_SIZE, T5_REGDUMP_SIZE); regs.data = calloc(1, len); if (regs.data == NULL) { warnc(ENOMEM, "regdump"); return (ENOMEM); } regs.len = len; rc = doit(CHELSIO_T4_REGDUMP, ®s); if (rc != 0) return (rc); vers = get_card_vers(regs.version); revision = (regs.version >> 10) & 0x3f; if (vers == 4) { if (revision == 0x3f) rc = dump_regs_t4vf(argc, argv, regs.data); else rc = dump_regs_t4(argc, argv, regs.data); } else if (vers == 5) { if (revision == 0x3f) rc = dump_regs_t5vf(argc, argv, regs.data); else rc = dump_regs_t5(argc, argv, regs.data); } else if (vers == 6) { if (revision == 0x3f) rc = dump_regs_t6vf(argc, argv, regs.data); else rc = dump_regs_t6(argc, argv, regs.data); } else { warnx("%s (type %d, rev %d) is not a known card.", nexus, vers, revision); return (ENOTSUP); } free(regs.data); return (rc); } static void do_show_info_header(uint32_t mode) { uint32_t i; printf("%4s %8s", "Idx", "Hits"); for (i = T4_FILTER_FCoE; i <= T4_FILTER_IP_FRAGMENT; i <<= 1) { switch (mode & i) { case T4_FILTER_FCoE: printf(" FCoE"); break; case T4_FILTER_PORT: printf(" Port"); break; case T4_FILTER_VNIC: if (mode & T4_FILTER_IC_VNIC) printf(" VFvld:PF:VF"); else printf(" vld:oVLAN"); break; case T4_FILTER_VLAN: printf(" vld:VLAN"); break; case T4_FILTER_IP_TOS: printf(" TOS"); break; case T4_FILTER_IP_PROTO: printf(" Prot"); break; case T4_FILTER_ETH_TYPE: printf(" EthType"); break; case T4_FILTER_MAC_IDX: printf(" MACIdx"); break; case T4_FILTER_MPS_HIT_TYPE: printf(" MPS"); break; case T4_FILTER_IP_FRAGMENT: printf(" Frag"); break; default: /* compressed filter field not enabled */ break; } } printf(" %20s %20s %9s %9s %s\n", "DIP", "SIP", "DPORT", "SPORT", "Action"); } /* * Parse an argument sub-vector as a { [:] } * ordered tuple. If the parameter name in the argument sub-vector does not * match the passed in parameter name, then a zero is returned for the * function and no parsing is performed. If there is a match, then the value * and optional mask are parsed and returned in the provided return value * pointers. If no optional mask is specified, then a default mask of all 1s * will be returned. * * An error in parsing the value[:mask] will result in an error message and * program termination. */ static int parse_val_mask(const char *param, const char *args[], uint32_t *val, uint32_t *mask) { char *p; if (strcmp(param, args[0]) != 0) return (EINVAL); *val = strtoul(args[1], &p, 0); if (p > args[1]) { if (p[0] == 0) { *mask = ~0; return (0); } if (p[0] == ':' && p[1] != 0) { *mask = strtoul(p+1, &p, 0); if (p[0] == 0) return (0); } } warnx("parameter \"%s\" has bad \"value[:mask]\" %s", args[0], args[1]); return (EINVAL); } /* * Parse an argument sub-vector as a { [/] } * ordered tuple. If the parameter name in the argument sub-vector does not * match the passed in parameter name, then a zero is returned for the * function and no parsing is performed. If there is a match, then the value * and optional mask are parsed and returned in the provided return value * pointers. If no optional mask is specified, then a default mask of all 1s * will be returned. * * The value return parameter "afp" is used to specify the expected address * family -- IPv4 or IPv6 -- of the address[/mask] and return its actual * format. A passed in value of AF_UNSPEC indicates that either IPv4 or IPv6 * is acceptable; AF_INET means that only IPv4 addresses are acceptable; and * AF_INET6 means that only IPv6 are acceptable. AF_INET is returned for IPv4 * and AF_INET6 for IPv6 addresses, respectively. IPv4 address/mask pairs are * returned in the first four bytes of the address and mask return values with * the address A.B.C.D returned with { A, B, C, D } returned in addresses { 0, * 1, 2, 3}, respectively. * * An error in parsing the value[:mask] will result in an error message and * program termination. */ static int parse_ipaddr(const char *param, const char *args[], int *afp, uint8_t addr[], uint8_t mask[]) { const char *colon, *afn; char *slash; uint8_t *m; int af, ret; unsigned int masksize; /* * Is this our parameter? */ if (strcmp(param, args[0]) != 0) return (EINVAL); /* * Fundamental IPv4 versus IPv6 selection. */ colon = strchr(args[1], ':'); if (!colon) { afn = "IPv4"; af = AF_INET; masksize = 32; } else { afn = "IPv6"; af = AF_INET6; masksize = 128; } if (*afp == AF_UNSPEC) *afp = af; else if (*afp != af) { warnx("address %s is not of expected family %s", args[1], *afp == AF_INET ? "IP" : "IPv6"); return (EINVAL); } /* * Parse address (temporarily stripping off any "/mask" * specification). */ slash = strchr(args[1], '/'); if (slash) *slash = 0; ret = inet_pton(af, args[1], addr); if (slash) *slash = '/'; if (ret <= 0) { warnx("Cannot parse %s %s address %s", param, afn, args[1]); return (EINVAL); } /* * Parse optional mask specification. */ if (slash) { char *p; unsigned int prefix = strtoul(slash + 1, &p, 10); if (p == slash + 1) { warnx("missing address prefix for %s", param); return (EINVAL); } if (*p) { warnx("%s is not a valid address prefix", slash + 1); return (EINVAL); } if (prefix > masksize) { warnx("prefix %u is too long for an %s address", prefix, afn); return (EINVAL); } memset(mask, 0, masksize / 8); masksize = prefix; } /* * Fill in mask. */ for (m = mask; masksize >= 8; m++, masksize -= 8) *m = ~0; if (masksize) *m = ~0 << (8 - masksize); return (0); } /* * Parse an argument sub-vector as a { } ordered * tuple. If the parameter name in the argument sub-vector does not match the * passed in parameter name, then a zero is returned for the function and no * parsing is performed. If there is a match, then the value is parsed and * returned in the provided return value pointer. */ static int parse_val(const char *param, const char *args[], uint32_t *val) { char *p; if (strcmp(param, args[0]) != 0) return (EINVAL); *val = strtoul(args[1], &p, 0); if (p > args[1] && p[0] == 0) return (0); warnx("parameter \"%s\" has bad \"value\" %s", args[0], args[1]); return (EINVAL); } static void filters_show_ipaddr(int type, uint8_t *addr, uint8_t *addrm) { int noctets, octet; printf(" "); if (type == 0) { noctets = 4; printf("%3s", " "); } else noctets = 16; for (octet = 0; octet < noctets; octet++) printf("%02x", addr[octet]); printf("/"); for (octet = 0; octet < noctets; octet++) printf("%02x", addrm[octet]); } static void do_show_one_filter_info(struct t4_filter *t, uint32_t mode) { uint32_t i; printf("%4d", t->idx); if (t->hits == UINT64_MAX) printf(" %8s", "-"); else printf(" %8ju", t->hits); /* * Compressed header portion of filter. */ for (i = T4_FILTER_FCoE; i <= T4_FILTER_IP_FRAGMENT; i <<= 1) { switch (mode & i) { case T4_FILTER_FCoE: printf(" %1d/%1d", t->fs.val.fcoe, t->fs.mask.fcoe); break; case T4_FILTER_PORT: printf(" %1d/%1d", t->fs.val.iport, t->fs.mask.iport); break; case T4_FILTER_VNIC: if (mode & T4_FILTER_IC_VNIC) { printf(" %1d:%1x:%02x/%1d:%1x:%02x", t->fs.val.pfvf_vld, (t->fs.val.vnic >> 13) & 0x7, t->fs.val.vnic & 0x1fff, t->fs.mask.pfvf_vld, (t->fs.mask.vnic >> 13) & 0x7, t->fs.mask.vnic & 0x1fff); } else { printf(" %1d:%04x/%1d:%04x", t->fs.val.ovlan_vld, t->fs.val.vnic, t->fs.mask.ovlan_vld, t->fs.mask.vnic); } break; case T4_FILTER_VLAN: printf(" %1d:%04x/%1d:%04x", t->fs.val.vlan_vld, t->fs.val.vlan, t->fs.mask.vlan_vld, t->fs.mask.vlan); break; case T4_FILTER_IP_TOS: printf(" %02x/%02x", t->fs.val.tos, t->fs.mask.tos); break; case T4_FILTER_IP_PROTO: printf(" %02x/%02x", t->fs.val.proto, t->fs.mask.proto); break; case T4_FILTER_ETH_TYPE: printf(" %04x/%04x", t->fs.val.ethtype, t->fs.mask.ethtype); break; case T4_FILTER_MAC_IDX: printf(" %03x/%03x", t->fs.val.macidx, t->fs.mask.macidx); break; case T4_FILTER_MPS_HIT_TYPE: printf(" %1x/%1x", t->fs.val.matchtype, t->fs.mask.matchtype); break; case T4_FILTER_IP_FRAGMENT: printf(" %1d/%1d", t->fs.val.frag, t->fs.mask.frag); break; default: /* compressed filter field not enabled */ break; } } /* * Fixed portion of filter. */ filters_show_ipaddr(t->fs.type, t->fs.val.dip, t->fs.mask.dip); filters_show_ipaddr(t->fs.type, t->fs.val.sip, t->fs.mask.sip); printf(" %04x/%04x %04x/%04x", t->fs.val.dport, t->fs.mask.dport, t->fs.val.sport, t->fs.mask.sport); /* * Variable length filter action. */ if (t->fs.action == FILTER_DROP) printf(" Drop"); else if (t->fs.action == FILTER_SWITCH) { printf(" Switch: port=%d", t->fs.eport); if (t->fs.newdmac) printf( ", dmac=%02x:%02x:%02x:%02x:%02x:%02x " ", l2tidx=%d", t->fs.dmac[0], t->fs.dmac[1], t->fs.dmac[2], t->fs.dmac[3], t->fs.dmac[4], t->fs.dmac[5], t->l2tidx); if (t->fs.newsmac) printf( ", smac=%02x:%02x:%02x:%02x:%02x:%02x " ", smtidx=%d", t->fs.smac[0], t->fs.smac[1], t->fs.smac[2], t->fs.smac[3], t->fs.smac[4], t->fs.smac[5], t->smtidx); if (t->fs.newvlan == VLAN_REMOVE) printf(", vlan=none"); else if (t->fs.newvlan == VLAN_INSERT) printf(", vlan=insert(%x)", t->fs.vlan); else if (t->fs.newvlan == VLAN_REWRITE) printf(", vlan=rewrite(%x)", t->fs.vlan); } else { printf(" Pass: Q="); if (t->fs.dirsteer == 0) { printf("RSS"); if (t->fs.maskhash) printf("(TCB=hash)"); } else { printf("%d", t->fs.iq); if (t->fs.dirsteerhash == 0) printf("(QID)"); else printf("(hash)"); } } if (t->fs.prio) printf(" Prio"); if (t->fs.rpttid) printf(" RptTID"); printf("\n"); } static int show_filters(void) { uint32_t mode = 0, header = 0; struct t4_filter t; int rc; /* Get the global filter mode first */ rc = doit(CHELSIO_T4_GET_FILTER_MODE, &mode); if (rc != 0) return (rc); t.idx = 0; for (t.idx = 0; ; t.idx++) { rc = doit(CHELSIO_T4_GET_FILTER, &t); if (rc != 0 || t.idx == 0xffffffff) break; if (!header) { do_show_info_header(mode); header = 1; } do_show_one_filter_info(&t, mode); }; return (rc); } static int get_filter_mode(void) { uint32_t mode = 0; int rc; rc = doit(CHELSIO_T4_GET_FILTER_MODE, &mode); if (rc != 0) return (rc); if (mode & T4_FILTER_IPv4) printf("ipv4 "); if (mode & T4_FILTER_IPv6) printf("ipv6 "); if (mode & T4_FILTER_IP_SADDR) printf("sip "); if (mode & T4_FILTER_IP_DADDR) printf("dip "); if (mode & T4_FILTER_IP_SPORT) printf("sport "); if (mode & T4_FILTER_IP_DPORT) printf("dport "); if (mode & T4_FILTER_IP_FRAGMENT) printf("frag "); if (mode & T4_FILTER_MPS_HIT_TYPE) printf("matchtype "); if (mode & T4_FILTER_MAC_IDX) printf("macidx "); if (mode & T4_FILTER_ETH_TYPE) printf("ethtype "); if (mode & T4_FILTER_IP_PROTO) printf("proto "); if (mode & T4_FILTER_IP_TOS) printf("tos "); if (mode & T4_FILTER_VLAN) printf("vlan "); if (mode & T4_FILTER_VNIC) { if (mode & T4_FILTER_IC_VNIC) printf("vnic_id "); else printf("ovlan "); } if (mode & T4_FILTER_PORT) printf("iport "); if (mode & T4_FILTER_FCoE) printf("fcoe "); printf("\n"); return (0); } static int set_filter_mode(int argc, const char *argv[]) { uint32_t mode = 0; int vnic = 0, ovlan = 0; for (; argc; argc--, argv++) { if (!strcmp(argv[0], "frag")) mode |= T4_FILTER_IP_FRAGMENT; if (!strcmp(argv[0], "matchtype")) mode |= T4_FILTER_MPS_HIT_TYPE; if (!strcmp(argv[0], "macidx")) mode |= T4_FILTER_MAC_IDX; if (!strcmp(argv[0], "ethtype")) mode |= T4_FILTER_ETH_TYPE; if (!strcmp(argv[0], "proto")) mode |= T4_FILTER_IP_PROTO; if (!strcmp(argv[0], "tos")) mode |= T4_FILTER_IP_TOS; if (!strcmp(argv[0], "vlan")) mode |= T4_FILTER_VLAN; if (!strcmp(argv[0], "ovlan")) { mode |= T4_FILTER_VNIC; ovlan++; } if (!strcmp(argv[0], "vnic_id")) { mode |= T4_FILTER_VNIC; mode |= T4_FILTER_IC_VNIC; vnic++; } if (!strcmp(argv[0], "iport")) mode |= T4_FILTER_PORT; if (!strcmp(argv[0], "fcoe")) mode |= T4_FILTER_FCoE; } if (vnic > 0 && ovlan > 0) { warnx("\"vnic_id\" and \"ovlan\" are mutually exclusive."); return (EINVAL); } return doit(CHELSIO_T4_SET_FILTER_MODE, &mode); } static int del_filter(uint32_t idx) { struct t4_filter t; t.idx = idx; return doit(CHELSIO_T4_DEL_FILTER, &t); } static int set_filter(uint32_t idx, int argc, const char *argv[]) { int af = AF_UNSPEC, start_arg = 0; struct t4_filter t; if (argc < 2) { warnc(EINVAL, "%s", __func__); return (EINVAL); }; bzero(&t, sizeof (t)); t.idx = idx; t.fs.hitcnts = 1; for (start_arg = 0; start_arg + 2 <= argc; start_arg += 2) { const char **args = &argv[start_arg]; uint32_t val, mask; if (!strcmp(argv[start_arg], "type")) { int newaf; if (!strcasecmp(argv[start_arg + 1], "ipv4")) newaf = AF_INET; else if (!strcasecmp(argv[start_arg + 1], "ipv6")) newaf = AF_INET6; else { warnx("invalid type \"%s\"; " "must be one of \"ipv4\" or \"ipv6\"", argv[start_arg + 1]); return (EINVAL); } if (af != AF_UNSPEC && af != newaf) { warnx("conflicting IPv4/IPv6 specifications."); return (EINVAL); } af = newaf; } else if (!parse_val_mask("fcoe", args, &val, &mask)) { t.fs.val.fcoe = val; t.fs.mask.fcoe = mask; } else if (!parse_val_mask("iport", args, &val, &mask)) { t.fs.val.iport = val; t.fs.mask.iport = mask; } else if (!parse_val_mask("ovlan", args, &val, &mask)) { t.fs.val.vnic = val; t.fs.mask.vnic = mask; t.fs.val.ovlan_vld = 1; t.fs.mask.ovlan_vld = 1; } else if (!parse_val_mask("ivlan", args, &val, &mask)) { t.fs.val.vlan = val; t.fs.mask.vlan = mask; t.fs.val.vlan_vld = 1; t.fs.mask.vlan_vld = 1; } else if (!parse_val_mask("pf", args, &val, &mask)) { t.fs.val.vnic &= 0x1fff; t.fs.val.vnic |= (val & 0x7) << 13; t.fs.mask.vnic &= 0x1fff; t.fs.mask.vnic |= (mask & 0x7) << 13; t.fs.val.pfvf_vld = 1; t.fs.mask.pfvf_vld = 1; } else if (!parse_val_mask("vf", args, &val, &mask)) { t.fs.val.vnic &= 0xe000; t.fs.val.vnic |= val & 0x1fff; t.fs.mask.vnic &= 0xe000; t.fs.mask.vnic |= mask & 0x1fff; t.fs.val.pfvf_vld = 1; t.fs.mask.pfvf_vld = 1; } else if (!parse_val_mask("tos", args, &val, &mask)) { t.fs.val.tos = val; t.fs.mask.tos = mask; } else if (!parse_val_mask("proto", args, &val, &mask)) { t.fs.val.proto = val; t.fs.mask.proto = mask; } else if (!parse_val_mask("ethtype", args, &val, &mask)) { t.fs.val.ethtype = val; t.fs.mask.ethtype = mask; } else if (!parse_val_mask("macidx", args, &val, &mask)) { t.fs.val.macidx = val; t.fs.mask.macidx = mask; } else if (!parse_val_mask("matchtype", args, &val, &mask)) { t.fs.val.matchtype = val; t.fs.mask.matchtype = mask; } else if (!parse_val_mask("frag", args, &val, &mask)) { t.fs.val.frag = val; t.fs.mask.frag = mask; } else if (!parse_val_mask("dport", args, &val, &mask)) { t.fs.val.dport = val; t.fs.mask.dport = mask; } else if (!parse_val_mask("sport", args, &val, &mask)) { t.fs.val.sport = val; t.fs.mask.sport = mask; } else if (!parse_ipaddr("dip", args, &af, t.fs.val.dip, t.fs.mask.dip)) { /* nada */; } else if (!parse_ipaddr("sip", args, &af, t.fs.val.sip, t.fs.mask.sip)) { /* nada */; } else if (!strcmp(argv[start_arg], "action")) { if (!strcmp(argv[start_arg + 1], "pass")) t.fs.action = FILTER_PASS; else if (!strcmp(argv[start_arg + 1], "drop")) t.fs.action = FILTER_DROP; else if (!strcmp(argv[start_arg + 1], "switch")) t.fs.action = FILTER_SWITCH; else { warnx("invalid action \"%s\"; must be one of" " \"pass\", \"drop\" or \"switch\"", argv[start_arg + 1]); return (EINVAL); } } else if (!parse_val("hitcnts", args, &val)) { t.fs.hitcnts = val; } else if (!parse_val("prio", args, &val)) { t.fs.prio = val; } else if (!parse_val("rpttid", args, &val)) { t.fs.rpttid = 1; } else if (!parse_val("queue", args, &val)) { t.fs.dirsteer = 1; t.fs.iq = val; } else if (!parse_val("tcbhash", args, &val)) { t.fs.maskhash = 1; t.fs.dirsteerhash = 1; } else if (!parse_val("eport", args, &val)) { t.fs.eport = val; } else if (!strcmp(argv[start_arg], "dmac")) { struct ether_addr *daddr; daddr = ether_aton(argv[start_arg + 1]); if (daddr == NULL) { warnx("invalid dmac address \"%s\"", argv[start_arg + 1]); return (EINVAL); } memcpy(t.fs.dmac, daddr, ETHER_ADDR_LEN); t.fs.newdmac = 1; } else if (!strcmp(argv[start_arg], "smac")) { struct ether_addr *saddr; saddr = ether_aton(argv[start_arg + 1]); if (saddr == NULL) { warnx("invalid smac address \"%s\"", argv[start_arg + 1]); return (EINVAL); } memcpy(t.fs.smac, saddr, ETHER_ADDR_LEN); t.fs.newsmac = 1; } else if (!strcmp(argv[start_arg], "vlan")) { char *p; if (!strcmp(argv[start_arg + 1], "none")) { t.fs.newvlan = VLAN_REMOVE; } else if (argv[start_arg + 1][0] == '=') { t.fs.newvlan = VLAN_REWRITE; } else if (argv[start_arg + 1][0] == '+') { t.fs.newvlan = VLAN_INSERT; } else if (isdigit(argv[start_arg + 1][0]) && !parse_val_mask("vlan", args, &val, &mask)) { t.fs.val.vlan = val; t.fs.mask.vlan = mask; t.fs.val.vlan_vld = 1; t.fs.mask.vlan_vld = 1; } else { warnx("unknown vlan parameter \"%s\"; must" " be one of \"none\", \"=\", " " \"+\", or \"\"", argv[start_arg + 1]); return (EINVAL); } if (t.fs.newvlan == VLAN_REWRITE || t.fs.newvlan == VLAN_INSERT) { t.fs.vlan = strtoul(argv[start_arg + 1] + 1, &p, 0); if (p == argv[start_arg + 1] + 1 || p[0] != 0) { warnx("invalid vlan \"%s\"", argv[start_arg + 1]); return (EINVAL); } } } else { warnx("invalid parameter \"%s\"", argv[start_arg]); return (EINVAL); } } if (start_arg != argc) { warnx("no value for \"%s\"", argv[start_arg]); return (EINVAL); } /* * Check basic sanity of option combinations. */ if (t.fs.action != FILTER_SWITCH && (t.fs.eport || t.fs.newdmac || t.fs.newsmac || t.fs.newvlan)) { warnx("prio, port dmac, smac and vlan only make sense with" " \"action switch\""); return (EINVAL); } if (t.fs.action != FILTER_PASS && (t.fs.rpttid || t.fs.dirsteer || t.fs.maskhash)) { warnx("rpttid, queue and tcbhash don't make sense with" " action \"drop\" or \"switch\""); return (EINVAL); } if (t.fs.val.ovlan_vld && t.fs.val.pfvf_vld) { warnx("ovlan and vnic_id (pf/vf) are mutually exclusive"); return (EINVAL); } t.fs.type = (af == AF_INET6 ? 1 : 0); /* default IPv4 */ return doit(CHELSIO_T4_SET_FILTER, &t); } static int filter_cmd(int argc, const char *argv[]) { long long val; uint32_t idx; char *s; if (argc == 0) { warnx("filter: no arguments."); return (EINVAL); }; /* list */ if (strcmp(argv[0], "list") == 0) { if (argc != 1) warnx("trailing arguments after \"list\" ignored."); return show_filters(); } /* mode */ if (argc == 1 && strcmp(argv[0], "mode") == 0) return get_filter_mode(); /* mode */ if (strcmp(argv[0], "mode") == 0) return set_filter_mode(argc - 1, argv + 1); /* ... */ s = str_to_number(argv[0], NULL, &val); if (*s || val > 0xffffffffU) { warnx("\"%s\" is neither an index nor a filter subcommand.", argv[0]); return (EINVAL); } idx = (uint32_t) val; /* delete|clear */ if (argc == 2 && (strcmp(argv[1], "delete") == 0 || strcmp(argv[1], "clear") == 0)) { return del_filter(idx); } /* [ ] ... */ return set_filter(idx, argc - 1, argv + 1); } /* * Shows the fields of a multi-word structure. The structure is considered to * consist of @nwords 32-bit words (i.e, it's an (@nwords * 32)-bit structure) * whose fields are described by @fd. The 32-bit words are given in @words * starting with the least significant 32-bit word. */ static void show_struct(const uint32_t *words, int nwords, const struct field_desc *fd) { unsigned int w = 0; const struct field_desc *p; for (p = fd; p->name; p++) w = max(w, strlen(p->name)); while (fd->name) { unsigned long long data; int first_word = fd->start / 32; int shift = fd->start % 32; int width = fd->end - fd->start + 1; unsigned long long mask = (1ULL << width) - 1; data = (words[first_word] >> shift) | ((uint64_t)words[first_word + 1] << (32 - shift)); if (shift) data |= ((uint64_t)words[first_word + 2] << (64 - shift)); data &= mask; if (fd->islog2) data = 1 << data; printf("%-*s ", w, fd->name); printf(fd->hex ? "%#llx\n" : "%llu\n", data << fd->shift); fd++; } } #define FIELD(name, start, end) { name, start, end, 0, 0, 0 } #define FIELD1(name, start) FIELD(name, start, start) static void show_t5t6_ctxt(const struct t4_sge_context *p, int vers) { static struct field_desc egress_t5[] = { FIELD("DCA_ST:", 181, 191), FIELD1("StatusPgNS:", 180), FIELD1("StatusPgRO:", 179), FIELD1("FetchNS:", 178), FIELD1("FetchRO:", 177), FIELD1("Valid:", 176), FIELD("PCIeDataChannel:", 174, 175), FIELD1("StatusPgTPHintEn:", 173), FIELD("StatusPgTPHint:", 171, 172), FIELD1("FetchTPHintEn:", 170), FIELD("FetchTPHint:", 168, 169), FIELD1("FCThreshOverride:", 167), { "WRLength:", 162, 166, 9, 0, 1 }, FIELD1("WRLengthKnown:", 161), FIELD1("ReschedulePending:", 160), FIELD1("OnChipQueue:", 159), FIELD1("FetchSizeMode:", 158), { "FetchBurstMin:", 156, 157, 4, 0, 1 }, FIELD1("FLMPacking:", 155), FIELD("FetchBurstMax:", 153, 154), FIELD("uPToken:", 133, 152), FIELD1("uPTokenEn:", 132), FIELD1("UserModeIO:", 131), FIELD("uPFLCredits:", 123, 130), FIELD1("uPFLCreditEn:", 122), FIELD("FID:", 111, 121), FIELD("HostFCMode:", 109, 110), FIELD1("HostFCOwner:", 108), { "CIDXFlushThresh:", 105, 107, 0, 0, 1 }, FIELD("CIDX:", 89, 104), FIELD("PIDX:", 73, 88), { "BaseAddress:", 18, 72, 9, 1 }, FIELD("QueueSize:", 2, 17), FIELD1("QueueType:", 1), FIELD1("CachePriority:", 0), { NULL } }; static struct field_desc egress_t6[] = { FIELD("DCA_ST:", 181, 191), FIELD1("StatusPgNS:", 180), FIELD1("StatusPgRO:", 179), FIELD1("FetchNS:", 178), FIELD1("FetchRO:", 177), FIELD1("Valid:", 176), FIELD1("ReschedulePending_1:", 175), FIELD1("PCIeDataChannel:", 174), FIELD1("StatusPgTPHintEn:", 173), FIELD("StatusPgTPHint:", 171, 172), FIELD1("FetchTPHintEn:", 170), FIELD("FetchTPHint:", 168, 169), FIELD1("FCThreshOverride:", 167), { "WRLength:", 162, 166, 9, 0, 1 }, FIELD1("WRLengthKnown:", 161), FIELD1("ReschedulePending:", 160), FIELD("TimerIx:", 157, 159), FIELD1("FetchBurstMin:", 156), FIELD1("FLMPacking:", 155), FIELD("FetchBurstMax:", 153, 154), FIELD("uPToken:", 133, 152), FIELD1("uPTokenEn:", 132), FIELD1("UserModeIO:", 131), FIELD("uPFLCredits:", 123, 130), FIELD1("uPFLCreditEn:", 122), FIELD("FID:", 111, 121), FIELD("HostFCMode:", 109, 110), FIELD1("HostFCOwner:", 108), { "CIDXFlushThresh:", 105, 107, 0, 0, 1 }, FIELD("CIDX:", 89, 104), FIELD("PIDX:", 73, 88), { "BaseAddress:", 18, 72, 9, 1 }, FIELD("QueueSize:", 2, 17), FIELD1("QueueType:", 1), FIELD1("FetchSizeMode:", 0), { NULL } }; static struct field_desc fl_t5[] = { FIELD("DCA_ST:", 181, 191), FIELD1("StatusPgNS:", 180), FIELD1("StatusPgRO:", 179), FIELD1("FetchNS:", 178), FIELD1("FetchRO:", 177), FIELD1("Valid:", 176), FIELD("PCIeDataChannel:", 174, 175), FIELD1("StatusPgTPHintEn:", 173), FIELD("StatusPgTPHint:", 171, 172), FIELD1("FetchTPHintEn:", 170), FIELD("FetchTPHint:", 168, 169), FIELD1("FCThreshOverride:", 167), FIELD1("ReschedulePending:", 160), FIELD1("OnChipQueue:", 159), FIELD1("FetchSizeMode:", 158), { "FetchBurstMin:", 156, 157, 4, 0, 1 }, FIELD1("FLMPacking:", 155), FIELD("FetchBurstMax:", 153, 154), FIELD1("FLMcongMode:", 152), FIELD("MaxuPFLCredits:", 144, 151), FIELD("FLMcontextID:", 133, 143), FIELD1("uPTokenEn:", 132), FIELD1("UserModeIO:", 131), FIELD("uPFLCredits:", 123, 130), FIELD1("uPFLCreditEn:", 122), FIELD("FID:", 111, 121), FIELD("HostFCMode:", 109, 110), FIELD1("HostFCOwner:", 108), { "CIDXFlushThresh:", 105, 107, 0, 0, 1 }, FIELD("CIDX:", 89, 104), FIELD("PIDX:", 73, 88), { "BaseAddress:", 18, 72, 9, 1 }, FIELD("QueueSize:", 2, 17), FIELD1("QueueType:", 1), FIELD1("CachePriority:", 0), { NULL } }; static struct field_desc ingress_t5[] = { FIELD("DCA_ST:", 143, 153), FIELD1("ISCSICoalescing:", 142), FIELD1("Queue_Valid:", 141), FIELD1("TimerPending:", 140), FIELD1("DropRSS:", 139), FIELD("PCIeChannel:", 137, 138), FIELD1("SEInterruptArmed:", 136), FIELD1("CongestionMgtEnable:", 135), FIELD1("NoSnoop:", 134), FIELD1("RelaxedOrdering:", 133), FIELD1("GTSmode:", 132), FIELD1("TPHintEn:", 131), FIELD("TPHint:", 129, 130), FIELD1("UpdateScheduling:", 128), FIELD("UpdateDelivery:", 126, 127), FIELD1("InterruptSent:", 125), FIELD("InterruptIDX:", 114, 124), FIELD1("InterruptDestination:", 113), FIELD1("InterruptArmed:", 112), FIELD("RxIntCounter:", 106, 111), FIELD("RxIntCounterThreshold:", 104, 105), FIELD1("Generation:", 103), { "BaseAddress:", 48, 102, 9, 1 }, FIELD("PIDX:", 32, 47), FIELD("CIDX:", 16, 31), { "QueueSize:", 4, 15, 4, 0 }, { "QueueEntrySize:", 2, 3, 4, 0, 1 }, FIELD1("QueueEntryOverride:", 1), FIELD1("CachePriority:", 0), { NULL } }; static struct field_desc ingress_t6[] = { FIELD1("SP_NS:", 158), FIELD1("SP_RO:", 157), FIELD1("SP_TPHintEn:", 156), FIELD("SP_TPHint:", 154, 155), FIELD("DCA_ST:", 143, 153), FIELD1("ISCSICoalescing:", 142), FIELD1("Queue_Valid:", 141), FIELD1("TimerPending:", 140), FIELD1("DropRSS:", 139), FIELD("PCIeChannel:", 137, 138), FIELD1("SEInterruptArmed:", 136), FIELD1("CongestionMgtEnable:", 135), FIELD1("NoSnoop:", 134), FIELD1("RelaxedOrdering:", 133), FIELD1("GTSmode:", 132), FIELD1("TPHintEn:", 131), FIELD("TPHint:", 129, 130), FIELD1("UpdateScheduling:", 128), FIELD("UpdateDelivery:", 126, 127), FIELD1("InterruptSent:", 125), FIELD("InterruptIDX:", 114, 124), FIELD1("InterruptDestination:", 113), FIELD1("InterruptArmed:", 112), FIELD("RxIntCounter:", 106, 111), FIELD("RxIntCounterThreshold:", 104, 105), FIELD1("Generation:", 103), { "BaseAddress:", 48, 102, 9, 1 }, FIELD("PIDX:", 32, 47), FIELD("CIDX:", 16, 31), { "QueueSize:", 4, 15, 4, 0 }, { "QueueEntrySize:", 2, 3, 4, 0, 1 }, FIELD1("QueueEntryOverride:", 1), FIELD1("CachePriority:", 0), { NULL } }; static struct field_desc flm_t5[] = { FIELD1("Valid:", 89), FIELD("SplitLenMode:", 87, 88), FIELD1("TPHintEn:", 86), FIELD("TPHint:", 84, 85), FIELD1("NoSnoop:", 83), FIELD1("RelaxedOrdering:", 82), FIELD("DCA_ST:", 71, 81), FIELD("EQid:", 54, 70), FIELD("SplitEn:", 52, 53), FIELD1("PadEn:", 51), FIELD1("PackEn:", 50), FIELD1("Cache_Lock :", 49), FIELD1("CongDrop:", 48), FIELD("PackOffset:", 16, 47), FIELD("CIDX:", 8, 15), FIELD("PIDX:", 0, 7), { NULL } }; static struct field_desc flm_t6[] = { FIELD1("Valid:", 89), FIELD("SplitLenMode:", 87, 88), FIELD1("TPHintEn:", 86), FIELD("TPHint:", 84, 85), FIELD1("NoSnoop:", 83), FIELD1("RelaxedOrdering:", 82), FIELD("DCA_ST:", 71, 81), FIELD("EQid:", 54, 70), FIELD("SplitEn:", 52, 53), FIELD1("PadEn:", 51), FIELD1("PackEn:", 50), FIELD1("Cache_Lock :", 49), FIELD1("CongDrop:", 48), FIELD1("Inflight:", 47), FIELD1("CongEn:", 46), FIELD1("CongMode:", 45), FIELD("PackOffset:", 20, 39), FIELD("CIDX:", 8, 15), FIELD("PIDX:", 0, 7), { NULL } }; static struct field_desc conm_t5[] = { FIELD1("CngMPSEnable:", 21), FIELD("CngTPMode:", 19, 20), FIELD1("CngDBPHdr:", 18), FIELD1("CngDBPData:", 17), FIELD1("CngIMSG:", 16), { "CngChMap:", 0, 15, 0, 1, 0 }, { NULL } }; if (p->mem_id == SGE_CONTEXT_EGRESS) { if (p->data[0] & 2) show_struct(p->data, 6, fl_t5); else if (vers == 5) show_struct(p->data, 6, egress_t5); else show_struct(p->data, 6, egress_t6); } else if (p->mem_id == SGE_CONTEXT_FLM) show_struct(p->data, 3, vers == 5 ? flm_t5 : flm_t6); else if (p->mem_id == SGE_CONTEXT_INGRESS) show_struct(p->data, 5, vers == 5 ? ingress_t5 : ingress_t6); else if (p->mem_id == SGE_CONTEXT_CNM) show_struct(p->data, 1, conm_t5); } static void show_t4_ctxt(const struct t4_sge_context *p) { static struct field_desc egress_t4[] = { FIELD1("StatusPgNS:", 180), FIELD1("StatusPgRO:", 179), FIELD1("FetchNS:", 178), FIELD1("FetchRO:", 177), FIELD1("Valid:", 176), FIELD("PCIeDataChannel:", 174, 175), FIELD1("DCAEgrQEn:", 173), FIELD("DCACPUID:", 168, 172), FIELD1("FCThreshOverride:", 167), FIELD("WRLength:", 162, 166), FIELD1("WRLengthKnown:", 161), FIELD1("ReschedulePending:", 160), FIELD1("OnChipQueue:", 159), FIELD1("FetchSizeMode", 158), { "FetchBurstMin:", 156, 157, 4, 0, 1 }, { "FetchBurstMax:", 153, 154, 6, 0, 1 }, FIELD("uPToken:", 133, 152), FIELD1("uPTokenEn:", 132), FIELD1("UserModeIO:", 131), FIELD("uPFLCredits:", 123, 130), FIELD1("uPFLCreditEn:", 122), FIELD("FID:", 111, 121), FIELD("HostFCMode:", 109, 110), FIELD1("HostFCOwner:", 108), { "CIDXFlushThresh:", 105, 107, 0, 0, 1 }, FIELD("CIDX:", 89, 104), FIELD("PIDX:", 73, 88), { "BaseAddress:", 18, 72, 9, 1 }, FIELD("QueueSize:", 2, 17), FIELD1("QueueType:", 1), FIELD1("CachePriority:", 0), { NULL } }; static struct field_desc fl_t4[] = { FIELD1("StatusPgNS:", 180), FIELD1("StatusPgRO:", 179), FIELD1("FetchNS:", 178), FIELD1("FetchRO:", 177), FIELD1("Valid:", 176), FIELD("PCIeDataChannel:", 174, 175), FIELD1("DCAEgrQEn:", 173), FIELD("DCACPUID:", 168, 172), FIELD1("FCThreshOverride:", 167), FIELD1("ReschedulePending:", 160), FIELD1("OnChipQueue:", 159), FIELD1("FetchSizeMode", 158), { "FetchBurstMin:", 156, 157, 4, 0, 1 }, { "FetchBurstMax:", 153, 154, 6, 0, 1 }, FIELD1("FLMcongMode:", 152), FIELD("MaxuPFLCredits:", 144, 151), FIELD("FLMcontextID:", 133, 143), FIELD1("uPTokenEn:", 132), FIELD1("UserModeIO:", 131), FIELD("uPFLCredits:", 123, 130), FIELD1("uPFLCreditEn:", 122), FIELD("FID:", 111, 121), FIELD("HostFCMode:", 109, 110), FIELD1("HostFCOwner:", 108), { "CIDXFlushThresh:", 105, 107, 0, 0, 1 }, FIELD("CIDX:", 89, 104), FIELD("PIDX:", 73, 88), { "BaseAddress:", 18, 72, 9, 1 }, FIELD("QueueSize:", 2, 17), FIELD1("QueueType:", 1), FIELD1("CachePriority:", 0), { NULL } }; static struct field_desc ingress_t4[] = { FIELD1("NoSnoop:", 145), FIELD1("RelaxedOrdering:", 144), FIELD1("GTSmode:", 143), FIELD1("ISCSICoalescing:", 142), FIELD1("Valid:", 141), FIELD1("TimerPending:", 140), FIELD1("DropRSS:", 139), FIELD("PCIeChannel:", 137, 138), FIELD1("SEInterruptArmed:", 136), FIELD1("CongestionMgtEnable:", 135), FIELD1("DCAIngQEnable:", 134), FIELD("DCACPUID:", 129, 133), FIELD1("UpdateScheduling:", 128), FIELD("UpdateDelivery:", 126, 127), FIELD1("InterruptSent:", 125), FIELD("InterruptIDX:", 114, 124), FIELD1("InterruptDestination:", 113), FIELD1("InterruptArmed:", 112), FIELD("RxIntCounter:", 106, 111), FIELD("RxIntCounterThreshold:", 104, 105), FIELD1("Generation:", 103), { "BaseAddress:", 48, 102, 9, 1 }, FIELD("PIDX:", 32, 47), FIELD("CIDX:", 16, 31), { "QueueSize:", 4, 15, 4, 0 }, { "QueueEntrySize:", 2, 3, 4, 0, 1 }, FIELD1("QueueEntryOverride:", 1), FIELD1("CachePriority:", 0), { NULL } }; static struct field_desc flm_t4[] = { FIELD1("NoSnoop:", 79), FIELD1("RelaxedOrdering:", 78), FIELD1("Valid:", 77), FIELD("DCACPUID:", 72, 76), FIELD1("DCAFLEn:", 71), FIELD("EQid:", 54, 70), FIELD("SplitEn:", 52, 53), FIELD1("PadEn:", 51), FIELD1("PackEn:", 50), FIELD1("DBpriority:", 48), FIELD("PackOffset:", 16, 47), FIELD("CIDX:", 8, 15), FIELD("PIDX:", 0, 7), { NULL } }; static struct field_desc conm_t4[] = { FIELD1("CngDBPHdr:", 6), FIELD1("CngDBPData:", 5), FIELD1("CngIMSG:", 4), { "CngChMap:", 0, 3, 0, 1, 0}, { NULL } }; if (p->mem_id == SGE_CONTEXT_EGRESS) show_struct(p->data, 6, (p->data[0] & 2) ? fl_t4 : egress_t4); else if (p->mem_id == SGE_CONTEXT_FLM) show_struct(p->data, 3, flm_t4); else if (p->mem_id == SGE_CONTEXT_INGRESS) show_struct(p->data, 5, ingress_t4); else if (p->mem_id == SGE_CONTEXT_CNM) show_struct(p->data, 1, conm_t4); } #undef FIELD #undef FIELD1 static int get_sge_context(int argc, const char *argv[]) { int rc; char *p; long cid; struct t4_sge_context cntxt = {0}; if (argc != 2) { warnx("sge_context: incorrect number of arguments."); return (EINVAL); } if (!strcmp(argv[0], "egress")) cntxt.mem_id = SGE_CONTEXT_EGRESS; else if (!strcmp(argv[0], "ingress")) cntxt.mem_id = SGE_CONTEXT_INGRESS; else if (!strcmp(argv[0], "fl")) cntxt.mem_id = SGE_CONTEXT_FLM; else if (!strcmp(argv[0], "cong")) cntxt.mem_id = SGE_CONTEXT_CNM; else { warnx("unknown context type \"%s\"; known types are egress, " "ingress, fl, and cong.", argv[0]); return (EINVAL); } p = str_to_number(argv[1], &cid, NULL); if (*p) { warnx("invalid context id \"%s\"", argv[1]); return (EINVAL); } cntxt.cid = cid; rc = doit(CHELSIO_T4_GET_SGE_CONTEXT, &cntxt); if (rc != 0) return (rc); if (chip_id == 4) show_t4_ctxt(&cntxt); else show_t5t6_ctxt(&cntxt, chip_id); return (0); } static int loadfw(int argc, const char *argv[]) { int rc, fd; struct t4_data data = {0}; const char *fname = argv[0]; struct stat st = {0}; if (argc != 1) { warnx("loadfw: incorrect number of arguments."); return (EINVAL); } fd = open(fname, O_RDONLY); if (fd < 0) { warn("open(%s)", fname); return (errno); } if (fstat(fd, &st) < 0) { warn("fstat"); close(fd); return (errno); } data.len = st.st_size; data.data = mmap(0, data.len, PROT_READ, MAP_PRIVATE, fd, 0); if (data.data == MAP_FAILED) { warn("mmap"); close(fd); return (errno); } rc = doit(CHELSIO_T4_LOAD_FW, &data); munmap(data.data, data.len); close(fd); return (rc); } static int +loadcfg(int argc, const char *argv[]) +{ + int rc, fd; + struct t4_data data = {0}; + const char *fname = argv[0]; + struct stat st = {0}; + + if (argc != 1) { + warnx("loadcfg: incorrect number of arguments."); + return (EINVAL); + } + + if (strcmp(fname, "clear") == 0) + return (doit(CHELSIO_T4_LOAD_CFG, &data)); + + fd = open(fname, O_RDONLY); + if (fd < 0) { + warn("open(%s)", fname); + return (errno); + } + + if (fstat(fd, &st) < 0) { + warn("fstat"); + close(fd); + return (errno); + } + + data.len = st.st_size; + data.len &= ~3; /* Clip off to make it a multiple of 4 */ + data.data = mmap(0, data.len, PROT_READ, MAP_PRIVATE, fd, 0); + if (data.data == MAP_FAILED) { + warn("mmap"); + close(fd); + return (errno); + } + + rc = doit(CHELSIO_T4_LOAD_CFG, &data); + munmap(data.data, data.len); + close(fd); + return (rc); +} + +static int read_mem(uint32_t addr, uint32_t len, void (*output)(uint32_t *, uint32_t)) { int rc; struct t4_mem_range mr; mr.addr = addr; mr.len = len; mr.data = malloc(mr.len); if (mr.data == 0) { warn("read_mem: malloc"); return (errno); } rc = doit(CHELSIO_T4_GET_MEM, &mr); if (rc != 0) goto done; if (output) (*output)(mr.data, mr.len); done: free(mr.data); return (rc); } /* * Display memory as list of 'n' 4-byte values per line. */ static void show_mem(uint32_t *buf, uint32_t len) { const char *s; int i, n = 8; while (len) { for (i = 0; len && i < n; i++, buf++, len -= 4) { s = i ? " " : ""; printf("%s%08x", s, htonl(*buf)); } printf("\n"); } } static int memdump(int argc, const char *argv[]) { char *p; long l; uint32_t addr, len; if (argc != 2) { warnx("incorrect number of arguments."); return (EINVAL); } p = str_to_number(argv[0], &l, NULL); if (*p) { warnx("invalid address \"%s\"", argv[0]); return (EINVAL); } addr = l; p = str_to_number(argv[1], &l, NULL); if (*p) { warnx("memdump: invalid length \"%s\"", argv[1]); return (EINVAL); } len = l; return (read_mem(addr, len, show_mem)); } /* * Display TCB as list of 'n' 4-byte values per line. */ static void show_tcb(uint32_t *buf, uint32_t len) { const char *s; int i, n = 8; while (len) { for (i = 0; len && i < n; i++, buf++, len -= 4) { s = i ? " " : ""; printf("%s%08x", s, htonl(*buf)); } printf("\n"); } } #define A_TP_CMM_TCB_BASE 0x7d10 #define TCB_SIZE 128 static int read_tcb(int argc, const char *argv[]) { char *p; long l; long long val; unsigned int tid; uint32_t addr; int rc; if (argc != 1) { warnx("incorrect number of arguments."); return (EINVAL); } p = str_to_number(argv[0], &l, NULL); if (*p) { warnx("invalid tid \"%s\"", argv[0]); return (EINVAL); } tid = l; rc = read_reg(A_TP_CMM_TCB_BASE, 4, &val); if (rc != 0) return (rc); addr = val + tid * TCB_SIZE; return (read_mem(addr, TCB_SIZE, show_tcb)); } static int read_i2c(int argc, const char *argv[]) { char *p; long l; struct t4_i2c_data i2cd; int rc, i; if (argc < 3 || argc > 4) { warnx("incorrect number of arguments."); return (EINVAL); } p = str_to_number(argv[0], &l, NULL); if (*p || l > UCHAR_MAX) { warnx("invalid port id \"%s\"", argv[0]); return (EINVAL); } i2cd.port_id = l; p = str_to_number(argv[1], &l, NULL); if (*p || l > UCHAR_MAX) { warnx("invalid i2c device address \"%s\"", argv[1]); return (EINVAL); } i2cd.dev_addr = l; p = str_to_number(argv[2], &l, NULL); if (*p || l > UCHAR_MAX) { warnx("invalid byte offset \"%s\"", argv[2]); return (EINVAL); } i2cd.offset = l; if (argc == 4) { p = str_to_number(argv[3], &l, NULL); if (*p || l > sizeof(i2cd.data)) { warnx("invalid number of bytes \"%s\"", argv[3]); return (EINVAL); } i2cd.len = l; } else i2cd.len = 1; rc = doit(CHELSIO_T4_GET_I2C, &i2cd); if (rc != 0) return (rc); for (i = 0; i < i2cd.len; i++) printf("0x%x [%u]\n", i2cd.data[i], i2cd.data[i]); return (0); } static int clearstats(int argc, const char *argv[]) { char *p; long l; uint32_t port; if (argc != 1) { warnx("incorrect number of arguments."); return (EINVAL); } p = str_to_number(argv[0], &l, NULL); if (*p) { warnx("invalid port id \"%s\"", argv[0]); return (EINVAL); } port = l; return doit(CHELSIO_T4_CLEAR_STATS, &port); } static int show_tracers(void) { struct t4_tracer t; char *s; int rc, port_idx, i; long long val; /* Magic values: MPS_TRC_CFG = 0x9800. MPS_TRC_CFG[1:1] = TrcEn */ rc = read_reg(0x9800, 4, &val); if (rc != 0) return (rc); printf("tracing is %s\n", val & 2 ? "ENABLED" : "DISABLED"); t.idx = 0; for (t.idx = 0; ; t.idx++) { rc = doit(CHELSIO_T4_GET_TRACER, &t); if (rc != 0 || t.idx == 0xff) break; if (t.tp.port < 4) { s = "Rx"; port_idx = t.tp.port; } else if (t.tp.port < 8) { s = "Tx"; port_idx = t.tp.port - 4; } else if (t.tp.port < 12) { s = "loopback"; port_idx = t.tp.port - 8; } else if (t.tp.port < 16) { s = "MPS Rx"; port_idx = t.tp.port - 12; } else if (t.tp.port < 20) { s = "MPS Tx"; port_idx = t.tp.port - 16; } else { s = "unknown"; port_idx = t.tp.port; } printf("\ntracer %u (currently %s) captures ", t.idx, t.enabled ? "ENABLED" : "DISABLED"); if (t.tp.port < 8) printf("port %u %s, ", port_idx, s); else printf("%s %u, ", s, port_idx); printf("snap length: %u, min length: %u\n", t.tp.snap_len, t.tp.min_len); printf("packets captured %smatch filter\n", t.tp.invert ? "do not " : ""); if (t.tp.skip_ofst) { printf("filter pattern: "); for (i = 0; i < t.tp.skip_ofst * 2; i += 2) printf("%08x%08x", t.tp.data[i], t.tp.data[i + 1]); printf("/"); for (i = 0; i < t.tp.skip_ofst * 2; i += 2) printf("%08x%08x", t.tp.mask[i], t.tp.mask[i + 1]); printf("@0\n"); } printf("filter pattern: "); for (i = t.tp.skip_ofst * 2; i < T4_TRACE_LEN / 4; i += 2) printf("%08x%08x", t.tp.data[i], t.tp.data[i + 1]); printf("/"); for (i = t.tp.skip_ofst * 2; i < T4_TRACE_LEN / 4; i += 2) printf("%08x%08x", t.tp.mask[i], t.tp.mask[i + 1]); printf("@%u\n", (t.tp.skip_ofst + t.tp.skip_len) * 8); } return (rc); } static int tracer_onoff(uint8_t idx, int enabled) { struct t4_tracer t; t.idx = idx; t.enabled = enabled; t.valid = 0; return doit(CHELSIO_T4_SET_TRACER, &t); } static void create_tracing_ifnet() { char *cmd[] = { "/sbin/ifconfig", __DECONST(char *, nexus), "create", NULL }; char *env[] = {NULL}; if (vfork() == 0) { close(STDERR_FILENO); execve(cmd[0], cmd, env); _exit(0); } } /* * XXX: Allow user to specify snaplen, minlen, and pattern (including inverted * matching). Right now this is a quick-n-dirty implementation that traces the * first 128B of all tx or rx on a port */ static int set_tracer(uint8_t idx, int argc, const char *argv[]) { struct t4_tracer t; int len, port; bzero(&t, sizeof (t)); t.idx = idx; t.enabled = 1; t.valid = 1; if (argc != 1) { warnx("must specify tx or rx."); return (EINVAL); } len = strlen(argv[0]); if (len != 3) { warnx("argument must be 3 characters (tx or rx)"); return (EINVAL); } if (strncmp(argv[0], "tx", 2) == 0) { port = argv[0][2] - '0'; if (port < 0 || port > 3) { warnx("'%c' in %s is invalid", argv[0][2], argv[0]); return (EINVAL); } port += 4; } else if (strncmp(argv[0], "rx", 2) == 0) { port = argv[0][2] - '0'; if (port < 0 || port > 3) { warnx("'%c' in %s is invalid", argv[0][2], argv[0]); return (EINVAL); } } else { warnx("argument '%s' isn't tx or rx", argv[0]); return (EINVAL); } t.tp.snap_len = 128; t.tp.min_len = 0; t.tp.skip_ofst = 0; t.tp.skip_len = 0; t.tp.invert = 0; t.tp.port = port; create_tracing_ifnet(); return doit(CHELSIO_T4_SET_TRACER, &t); } static int tracer_cmd(int argc, const char *argv[]) { long long val; uint8_t idx; char *s; if (argc == 0) { warnx("tracer: no arguments."); return (EINVAL); }; /* list */ if (strcmp(argv[0], "list") == 0) { if (argc != 1) warnx("trailing arguments after \"list\" ignored."); return show_tracers(); } /* ... */ s = str_to_number(argv[0], NULL, &val); if (*s || val > 0xff) { warnx("\"%s\" is neither an index nor a tracer subcommand.", argv[0]); return (EINVAL); } idx = (int8_t)val; /* disable */ if (argc == 2 && strcmp(argv[1], "disable") == 0) return tracer_onoff(idx, 0); /* enable */ if (argc == 2 && strcmp(argv[1], "enable") == 0) return tracer_onoff(idx, 1); /* ... */ return set_tracer(idx, argc - 1, argv + 1); } static int modinfo_raw(int port_id) { uint8_t offset; struct t4_i2c_data i2cd; int rc; for (offset = 0; offset < 96; offset += sizeof(i2cd.data)) { bzero(&i2cd, sizeof(i2cd)); i2cd.port_id = port_id; i2cd.dev_addr = 0xa0; i2cd.offset = offset; i2cd.len = sizeof(i2cd.data); rc = doit(CHELSIO_T4_GET_I2C, &i2cd); if (rc != 0) return (rc); printf("%02x: %02x %02x %02x %02x %02x %02x %02x %02x", offset, i2cd.data[0], i2cd.data[1], i2cd.data[2], i2cd.data[3], i2cd.data[4], i2cd.data[5], i2cd.data[6], i2cd.data[7]); printf(" %c%c%c%c %c%c%c%c\n", isprint(i2cd.data[0]) ? i2cd.data[0] : '.', isprint(i2cd.data[1]) ? i2cd.data[1] : '.', isprint(i2cd.data[2]) ? i2cd.data[2] : '.', isprint(i2cd.data[3]) ? i2cd.data[3] : '.', isprint(i2cd.data[4]) ? i2cd.data[4] : '.', isprint(i2cd.data[5]) ? i2cd.data[5] : '.', isprint(i2cd.data[6]) ? i2cd.data[6] : '.', isprint(i2cd.data[7]) ? i2cd.data[7] : '.'); } return (0); } static int modinfo(int argc, const char *argv[]) { long port; char string[16], *p; struct t4_i2c_data i2cd; int rc, i; uint16_t temp, vcc, tx_bias, tx_power, rx_power; if (argc < 1) { warnx("must supply a port"); return (EINVAL); } if (argc > 2) { warnx("too many arguments"); return (EINVAL); } p = str_to_number(argv[0], &port, NULL); if (*p || port > UCHAR_MAX) { warnx("invalid port id \"%s\"", argv[0]); return (EINVAL); } if (argc == 2) { if (!strcmp(argv[1], "raw")) return (modinfo_raw(port)); else { warnx("second argument can only be \"raw\""); return (EINVAL); } } bzero(&i2cd, sizeof(i2cd)); i2cd.len = 1; i2cd.port_id = port; i2cd.dev_addr = SFF_8472_BASE; i2cd.offset = SFF_8472_ID; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; if (i2cd.data[0] > SFF_8472_ID_LAST) printf("Unknown ID\n"); else printf("ID: %s\n", sff_8472_id[i2cd.data[0]]); bzero(&string, sizeof(string)); for (i = SFF_8472_VENDOR_START; i < SFF_8472_VENDOR_END; i++) { i2cd.offset = i; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; string[i - SFF_8472_VENDOR_START] = i2cd.data[0]; } printf("Vendor %s\n", string); bzero(&string, sizeof(string)); for (i = SFF_8472_SN_START; i < SFF_8472_SN_END; i++) { i2cd.offset = i; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; string[i - SFF_8472_SN_START] = i2cd.data[0]; } printf("SN %s\n", string); bzero(&string, sizeof(string)); for (i = SFF_8472_PN_START; i < SFF_8472_PN_END; i++) { i2cd.offset = i; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; string[i - SFF_8472_PN_START] = i2cd.data[0]; } printf("PN %s\n", string); bzero(&string, sizeof(string)); for (i = SFF_8472_REV_START; i < SFF_8472_REV_END; i++) { i2cd.offset = i; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; string[i - SFF_8472_REV_START] = i2cd.data[0]; } printf("Rev %s\n", string); i2cd.offset = SFF_8472_DIAG_TYPE; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; if ((char )i2cd.data[0] & (SFF_8472_DIAG_IMPL | SFF_8472_DIAG_INTERNAL)) { /* Switch to reading from the Diagnostic address. */ i2cd.dev_addr = SFF_8472_DIAG; i2cd.len = 1; i2cd.offset = SFF_8472_TEMP; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; temp = i2cd.data[0] << 8; printf("Temp: "); if ((temp & SFF_8472_TEMP_SIGN) == SFF_8472_TEMP_SIGN) printf("-"); else printf("+"); printf("%dC\n", (temp & SFF_8472_TEMP_MSK) >> SFF_8472_TEMP_SHIFT); i2cd.offset = SFF_8472_VCC; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; vcc = i2cd.data[0] << 8; printf("Vcc %fV\n", vcc / SFF_8472_VCC_FACTOR); i2cd.offset = SFF_8472_TX_BIAS; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; tx_bias = i2cd.data[0] << 8; printf("TX Bias %fuA\n", tx_bias / SFF_8472_BIAS_FACTOR); i2cd.offset = SFF_8472_TX_POWER; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; tx_power = i2cd.data[0] << 8; printf("TX Power %fmW\n", tx_power / SFF_8472_POWER_FACTOR); i2cd.offset = SFF_8472_RX_POWER; if ((rc = doit(CHELSIO_T4_GET_I2C, &i2cd)) != 0) goto fail; rx_power = i2cd.data[0] << 8; printf("RX Power %fmW\n", rx_power / SFF_8472_POWER_FACTOR); } else printf("Diagnostics not supported.\n"); return(0); fail: if (rc == EPERM) warnx("No module/cable in port %ld", port); return (rc); } /* XXX: pass in a low/high and do range checks as well */ static int get_sched_param(const char *param, const char *args[], long *val) { char *p; if (strcmp(param, args[0]) != 0) return (EINVAL); p = str_to_number(args[1], val, NULL); if (*p) { warnx("parameter \"%s\" has bad value \"%s\"", args[0], args[1]); return (EINVAL); } return (0); } static int sched_class(int argc, const char *argv[]) { struct t4_sched_params op; int errs, i; memset(&op, 0xff, sizeof(op)); op.subcmd = -1; op.type = -1; if (argc == 0) { warnx("missing scheduling sub-command"); return (EINVAL); } if (!strcmp(argv[0], "config")) { op.subcmd = SCHED_CLASS_SUBCMD_CONFIG; op.u.config.minmax = -1; } else if (!strcmp(argv[0], "params")) { op.subcmd = SCHED_CLASS_SUBCMD_PARAMS; op.u.params.level = op.u.params.mode = op.u.params.rateunit = op.u.params.ratemode = op.u.params.channel = op.u.params.cl = op.u.params.minrate = op.u.params.maxrate = op.u.params.weight = op.u.params.pktsize = -1; } else { warnx("invalid scheduling sub-command \"%s\"", argv[0]); return (EINVAL); } /* Decode remaining arguments ... */ errs = 0; for (i = 1; i < argc; i += 2) { const char **args = &argv[i]; long l; if (i + 1 == argc) { warnx("missing argument for \"%s\"", args[0]); errs++; break; } if (!strcmp(args[0], "type")) { if (!strcmp(args[1], "packet")) op.type = SCHED_CLASS_TYPE_PACKET; else { warnx("invalid type parameter \"%s\"", args[1]); errs++; } continue; } if (op.subcmd == SCHED_CLASS_SUBCMD_CONFIG) { if(!get_sched_param("minmax", args, &l)) op.u.config.minmax = (int8_t)l; else { warnx("unknown scheduler config parameter " "\"%s\"", args[0]); errs++; } continue; } /* Rest applies only to SUBCMD_PARAMS */ if (op.subcmd != SCHED_CLASS_SUBCMD_PARAMS) continue; if (!strcmp(args[0], "level")) { if (!strcmp(args[1], "cl-rl")) op.u.params.level = SCHED_CLASS_LEVEL_CL_RL; else if (!strcmp(args[1], "cl-wrr")) op.u.params.level = SCHED_CLASS_LEVEL_CL_WRR; else if (!strcmp(args[1], "ch-rl")) op.u.params.level = SCHED_CLASS_LEVEL_CH_RL; else { warnx("invalid level parameter \"%s\"", args[1]); errs++; } } else if (!strcmp(args[0], "mode")) { if (!strcmp(args[1], "class")) op.u.params.mode = SCHED_CLASS_MODE_CLASS; else if (!strcmp(args[1], "flow")) op.u.params.mode = SCHED_CLASS_MODE_FLOW; else { warnx("invalid mode parameter \"%s\"", args[1]); errs++; } } else if (!strcmp(args[0], "rate-unit")) { if (!strcmp(args[1], "bits")) op.u.params.rateunit = SCHED_CLASS_RATEUNIT_BITS; else if (!strcmp(args[1], "pkts")) op.u.params.rateunit = SCHED_CLASS_RATEUNIT_PKTS; else { warnx("invalid rate-unit parameter \"%s\"", args[1]); errs++; } } else if (!strcmp(args[0], "rate-mode")) { if (!strcmp(args[1], "relative")) op.u.params.ratemode = SCHED_CLASS_RATEMODE_REL; else if (!strcmp(args[1], "absolute")) op.u.params.ratemode = SCHED_CLASS_RATEMODE_ABS; else { warnx("invalid rate-mode parameter \"%s\"", args[1]); errs++; } } else if (!get_sched_param("channel", args, &l)) op.u.params.channel = (int8_t)l; else if (!get_sched_param("class", args, &l)) op.u.params.cl = (int8_t)l; else if (!get_sched_param("min-rate", args, &l)) op.u.params.minrate = (int32_t)l; else if (!get_sched_param("max-rate", args, &l)) op.u.params.maxrate = (int32_t)l; else if (!get_sched_param("weight", args, &l)) op.u.params.weight = (int16_t)l; else if (!get_sched_param("pkt-size", args, &l)) op.u.params.pktsize = (int16_t)l; else { warnx("unknown scheduler parameter \"%s\"", args[0]); errs++; } } /* * Catch some logical fallacies in terms of argument combinations here * so we can offer more than just the EINVAL return from the driver. * The driver will be able to catch a lot more issues since it knows * the specifics of the device hardware capabilities like how many * channels, classes, etc. the device supports. */ if (op.type < 0) { warnx("sched \"type\" parameter missing"); errs++; } if (op.subcmd == SCHED_CLASS_SUBCMD_CONFIG) { if (op.u.config.minmax < 0) { warnx("sched config \"minmax\" parameter missing"); errs++; } } if (op.subcmd == SCHED_CLASS_SUBCMD_PARAMS) { if (op.u.params.level < 0) { warnx("sched params \"level\" parameter missing"); errs++; } if (op.u.params.mode < 0) { warnx("sched params \"mode\" parameter missing"); errs++; } if (op.u.params.rateunit < 0) { warnx("sched params \"rate-unit\" parameter missing"); errs++; } if (op.u.params.ratemode < 0) { warnx("sched params \"rate-mode\" parameter missing"); errs++; } if (op.u.params.channel < 0) { warnx("sched params \"channel\" missing"); errs++; } if (op.u.params.cl < 0) { warnx("sched params \"class\" missing"); errs++; } if (op.u.params.maxrate < 0 && (op.u.params.level == SCHED_CLASS_LEVEL_CL_RL || op.u.params.level == SCHED_CLASS_LEVEL_CH_RL)) { warnx("sched params \"max-rate\" missing for " "rate-limit level"); errs++; } if (op.u.params.weight < 0 && op.u.params.level == SCHED_CLASS_LEVEL_CL_WRR) { warnx("sched params \"weight\" missing for " "weighted-round-robin level"); errs++; } if (op.u.params.pktsize < 0 && (op.u.params.level == SCHED_CLASS_LEVEL_CL_RL || op.u.params.level == SCHED_CLASS_LEVEL_CH_RL)) { warnx("sched params \"pkt-size\" missing for " "rate-limit level"); errs++; } if (op.u.params.mode == SCHED_CLASS_MODE_FLOW && op.u.params.ratemode != SCHED_CLASS_RATEMODE_ABS) { warnx("sched params mode flow needs rate-mode absolute"); errs++; } if (op.u.params.ratemode == SCHED_CLASS_RATEMODE_REL && !in_range(op.u.params.maxrate, 1, 100)) { warnx("sched params \"max-rate\" takes " "percentage value(1-100) for rate-mode relative"); errs++; } if (op.u.params.ratemode == SCHED_CLASS_RATEMODE_ABS && !in_range(op.u.params.maxrate, 1, 100000000)) { warnx("sched params \"max-rate\" takes " "value(1-100000000) for rate-mode absolute"); errs++; } if (op.u.params.maxrate > 0 && op.u.params.maxrate < op.u.params.minrate) { warnx("sched params \"max-rate\" is less than " "\"min-rate\""); errs++; } } if (errs > 0) { warnx("%d error%s in sched-class command", errs, errs == 1 ? "" : "s"); return (EINVAL); } return doit(CHELSIO_T4_SCHED_CLASS, &op); } static int sched_queue(int argc, const char *argv[]) { struct t4_sched_queue op = {0}; char *p; long val; if (argc != 3) { /* need " */ warnx("incorrect number of arguments."); return (EINVAL); } p = str_to_number(argv[0], &val, NULL); if (*p || val > UCHAR_MAX) { warnx("invalid port id \"%s\"", argv[0]); return (EINVAL); } op.port = (uint8_t)val; if (!strcmp(argv[1], "all") || !strcmp(argv[1], "*")) op.queue = -1; else { p = str_to_number(argv[1], &val, NULL); if (*p || val < -1) { warnx("invalid queue \"%s\"", argv[1]); return (EINVAL); } op.queue = (int8_t)val; } if (!strcmp(argv[2], "unbind") || !strcmp(argv[2], "clear")) op.cl = -1; else { p = str_to_number(argv[2], &val, NULL); if (*p || val < -1) { warnx("invalid class \"%s\"", argv[2]); return (EINVAL); } op.cl = (int8_t)val; } return doit(CHELSIO_T4_SCHED_QUEUE, &op); } static int run_cmd(int argc, const char *argv[]) { int rc = -1; const char *cmd = argv[0]; /* command */ argc--; argv++; if (!strcmp(cmd, "reg") || !strcmp(cmd, "reg32")) rc = register_io(argc, argv, 4); else if (!strcmp(cmd, "reg64")) rc = register_io(argc, argv, 8); else if (!strcmp(cmd, "regdump")) rc = dump_regs(argc, argv); else if (!strcmp(cmd, "filter")) rc = filter_cmd(argc, argv); else if (!strcmp(cmd, "context")) rc = get_sge_context(argc, argv); else if (!strcmp(cmd, "loadfw")) rc = loadfw(argc, argv); else if (!strcmp(cmd, "memdump")) rc = memdump(argc, argv); else if (!strcmp(cmd, "tcb")) rc = read_tcb(argc, argv); else if (!strcmp(cmd, "i2c")) rc = read_i2c(argc, argv); else if (!strcmp(cmd, "clearstats")) rc = clearstats(argc, argv); else if (!strcmp(cmd, "tracer")) rc = tracer_cmd(argc, argv); else if (!strcmp(cmd, "modinfo")) rc = modinfo(argc, argv); else if (!strcmp(cmd, "sched-class")) rc = sched_class(argc, argv); else if (!strcmp(cmd, "sched-queue")) rc = sched_queue(argc, argv); + else if (!strcmp(cmd, "loadcfg")) + rc = loadcfg(argc, argv); else { rc = EINVAL; warnx("invalid command \"%s\"", cmd); } return (rc); } #define MAX_ARGS 15 static int run_cmd_loop(void) { int i, rc = 0; char buffer[128], *buf; const char *args[MAX_ARGS + 1]; /* * Simple loop: displays a "> " prompt and processes any input as a * cxgbetool command. You're supposed to enter only the part after * "cxgbetool t4nexX". Use "quit" or "exit" to exit. */ for (;;) { fprintf(stdout, "> "); fflush(stdout); buf = fgets(buffer, sizeof(buffer), stdin); if (buf == NULL) { if (ferror(stdin)) { warn("stdin error"); rc = errno; /* errno from fgets */ } break; } i = 0; while ((args[i] = strsep(&buf, " \t\n")) != NULL) { if (args[i][0] != 0 && ++i == MAX_ARGS) break; } args[i] = 0; if (i == 0) continue; /* skip empty line */ if (!strcmp(args[0], "quit") || !strcmp(args[0], "exit")) break; rc = run_cmd(i, args); } /* rc normally comes from the last command (not including quit/exit) */ return (rc); } int main(int argc, const char *argv[]) { int rc = -1; progname = argv[0]; if (argc == 2) { if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) { usage(stdout); exit(0); } } if (argc < 3) { usage(stderr); exit(EINVAL); } nexus = argv[1]; /* progname and nexus */ argc -= 2; argv += 2; if (argc == 1 && !strcmp(argv[0], "stdio")) rc = run_cmd_loop(); else rc = run_cmd(argc, argv); return (rc); } Index: user/alc/PQ_LAUNDRY/usr.bin/cmp/cmp.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/cmp/cmp.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/cmp/cmp.c (revision 306832) @@ -1,229 +1,226 @@ /* * Copyright (c) 1987, 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1987, 1990, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif #if 0 #ifndef lint static char sccsid[] = "@(#)cmp.c 8.3 (Berkeley) 4/2/94"; #endif #endif #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include #include -#include #include #include "extern.h" int lflag, sflag, xflag, zflag; static void usage(void); int main(int argc, char *argv[]) { struct stat sb1, sb2; off_t skip1, skip2; int ch, fd1, fd2, oflag, special; const char *file1, *file2; cap_rights_t rights; - unsigned long cmd; uint32_t fcntls; oflag = O_RDONLY; while ((ch = getopt(argc, argv, "hlsxz")) != -1) switch (ch) { case 'h': /* Don't follow symlinks */ oflag |= O_NOFOLLOW; break; case 'l': /* print all differences */ lflag = 1; break; case 's': /* silent run */ sflag = 1; zflag = 1; break; case 'x': /* hex output */ lflag = 1; xflag = 1; break; case 'z': /* compare size first */ zflag = 1; break; case '?': default: usage(); } argv += optind; argc -= optind; if (lflag && sflag) errx(ERR_EXIT, "specifying -s with -l or -x is not permitted"); if (argc < 2 || argc > 4) usage(); /* Backward compatibility -- handle "-" meaning stdin. */ special = 0; if (strcmp(file1 = argv[0], "-") == 0) { special = 1; fd1 = 0; file1 = "stdin"; } else if ((fd1 = open(file1, oflag, 0)) < 0 && errno != EMLINK) { if (!sflag) err(ERR_EXIT, "%s", file1); else exit(ERR_EXIT); } if (strcmp(file2 = argv[1], "-") == 0) { if (special) errx(ERR_EXIT, "standard input may only be specified once"); special = 1; fd2 = 0; file2 = "stdin"; } else if ((fd2 = open(file2, oflag, 0)) < 0 && errno != EMLINK) { if (!sflag) err(ERR_EXIT, "%s", file2); else exit(ERR_EXIT); } skip1 = argc > 2 ? strtol(argv[2], NULL, 0) : 0; skip2 = argc == 4 ? strtol(argv[3], NULL, 0) : 0; if (fd1 == -1) { if (fd2 == -1) { c_link(file1, skip1, file2, skip2); exit(0); } else if (!sflag) errx(ERR_EXIT, "%s: Not a symbolic link", file2); else exit(ERR_EXIT); } else if (fd2 == -1) { if (!sflag) errx(ERR_EXIT, "%s: Not a symbolic link", file1); else exit(ERR_EXIT); } cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_MMAP_R); if (cap_rights_limit(fd1, &rights) < 0 && errno != ENOSYS) err(ERR_EXIT, "unable to limit rights for %s", file1); if (cap_rights_limit(fd2, &rights) < 0 && errno != ENOSYS) err(ERR_EXIT, "unable to limit rights for %s", file2); /* Required for fdopen(3). */ fcntls = CAP_FCNTL_GETFL; if (cap_fcntls_limit(fd1, fcntls) < 0 && errno != ENOSYS) err(ERR_EXIT, "unable to limit fcntls for %s", file1); if (cap_fcntls_limit(fd2, fcntls) < 0 && errno != ENOSYS) err(ERR_EXIT, "unable to limit fcntls for %s", file2); - cap_rights_init(&rights, CAP_FSTAT, CAP_WRITE, CAP_IOCTL); - if (cap_rights_limit(STDOUT_FILENO, &rights) < 0 && errno != ENOSYS) - err(ERR_EXIT, "unable to limit rights for stdout"); + if (!special) { + cap_rights_init(&rights); + if (cap_rights_limit(STDIN_FILENO, &rights) < 0 && + errno != ENOSYS) { + err(ERR_EXIT, "unable to limit stdio"); + } + } - /* Required for printf(3) via isatty(3). */ - cmd = TIOCGETA; - if (cap_ioctls_limit(STDOUT_FILENO, &cmd, 1) < 0 && errno != ENOSYS) - err(ERR_EXIT, "unable to limit ioctls for stdout"); + if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) + err(ERR_EXIT, "unable to limit stdio"); - /* - * Cache NLS data, for strerror, for err(3), before entering capability - * mode. - */ - (void)catopen("libc", NL_CAT_LOCALE); + caph_cache_catpages(); if (cap_enter() < 0 && errno != ENOSYS) err(ERR_EXIT, "unable to enter capability mode"); if (!special) { if (fstat(fd1, &sb1)) { if (!sflag) err(ERR_EXIT, "%s", file1); else exit(ERR_EXIT); } if (!S_ISREG(sb1.st_mode)) special = 1; else { if (fstat(fd2, &sb2)) { if (!sflag) err(ERR_EXIT, "%s", file2); else exit(ERR_EXIT); } if (!S_ISREG(sb2.st_mode)) special = 1; } } if (special) c_special(fd1, file1, skip1, fd2, file2, skip2); else { if (zflag && sb1.st_size != sb2.st_size) { if (!sflag) (void) printf("%s %s differ: size\n", file1, file2); exit(DIFF_EXIT); } c_regular(fd1, file1, skip1, sb1.st_size, fd2, file2, skip2, sb2.st_size); } exit(0); } static void usage(void) { (void)fprintf(stderr, "usage: cmp [-l | -s | -x] [-hz] file1 file2 [skip1 [skip2]]\n"); exit(ERR_EXIT); } Index: user/alc/PQ_LAUNDRY/usr.bin/col/col.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/col/col.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/col/col.c (revision 306832) @@ -1,599 +1,591 @@ /*- * Copyright (c) 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Michael Rendell of the Memorial University of Newfoundland. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1990, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif #if 0 #ifndef lint static char sccsid[] = "@(#)col.c 8.5 (Berkeley) 5/4/95"; #endif #endif #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #define BS '\b' /* backspace */ #define TAB '\t' /* tab */ #define SPACE ' ' /* space */ #define NL '\n' /* newline */ #define CR '\r' /* carriage return */ #define ESC '\033' /* escape */ #define SI '\017' /* shift in to normal character set */ #define SO '\016' /* shift out to alternate character set */ #define VT '\013' /* vertical tab (aka reverse line feed) */ #define RLF '7' /* ESC-7 reverse line feed */ #define RHLF '8' /* ESC-8 reverse half-line feed */ #define FHLF '9' /* ESC-9 forward half-line feed */ /* build up at least this many lines before flushing them out */ #define BUFFER_MARGIN 32 typedef char CSET; typedef struct char_str { #define CS_NORMAL 1 #define CS_ALTERNATE 2 short c_column; /* column character is in */ CSET c_set; /* character set (currently only 2) */ wchar_t c_char; /* character in question */ int c_width; /* character width */ } CHAR; typedef struct line_str LINE; struct line_str { CHAR *l_line; /* characters on the line */ LINE *l_prev; /* previous line */ LINE *l_next; /* next line */ int l_lsize; /* allocated sizeof l_line */ int l_line_len; /* strlen(l_line) */ int l_needs_sort; /* set if chars went in out of order */ int l_max_col; /* max column in the line */ }; static void addto_lineno(int *, int); static LINE *alloc_line(void); static void dowarn(int); static void flush_line(LINE *); static void flush_lines(int); static void flush_blanks(void); static void free_line(LINE *); static void usage(void); static CSET last_set; /* char_set of last char printed */ static LINE *lines; static int compress_spaces; /* if doing space -> tab conversion */ static int fine; /* if `fine' resolution (half lines) */ static int max_bufd_lines; /* max # of half lines to keep in memory */ static int nblank_lines; /* # blanks after last flushed line */ static int no_backspaces; /* if not to output any backspaces */ static int pass_unknown_seqs; /* pass unknown control sequences */ #define PUTC(ch) \ do { \ if (putwchar(ch) == WEOF) \ errx(1, "write error"); \ } while (0) int main(int argc, char **argv) { wint_t ch; CHAR *c; CSET cur_set; /* current character set */ LINE *l; /* current line */ int extra_lines; /* # of lines above first line */ int cur_col; /* current column */ int cur_line; /* line number of current position */ int max_line; /* max value of cur_line */ int this_line; /* line l points to */ int nflushd_lines; /* number of lines that were flushed */ int adjust, opt, warned, width; const char *errstr; - cap_rights_t rights; - unsigned long cmd; (void)setlocale(LC_CTYPE, ""); - cap_rights_init(&rights, CAP_FSTAT, CAP_READ); - if (cap_rights_limit(STDIN_FILENO, &rights) < 0 && errno != ENOSYS) - err(1, "unable to limit rights for stdin"); - cap_rights_init(&rights, CAP_FSTAT, CAP_WRITE, CAP_IOCTL); - if (cap_rights_limit(STDOUT_FILENO, &rights) < 0 && errno != ENOSYS) - err(1, "unable to limit rights for stdout"); - cmd = TIOCGETA; /* required by isatty(3) in printf(3) */ - if (cap_ioctls_limit(STDOUT_FILENO, &cmd, 1) < 0 && errno != ENOSYS) - err(1, "unable to limit ioctls for stdout"); + if (caph_limit_stdio() == -1) + err(1, "unable to limit stdio"); if (cap_enter() < 0 && errno != ENOSYS) err(1, "unable to enter capability mode"); max_bufd_lines = 256; compress_spaces = 1; /* compress spaces into tabs */ while ((opt = getopt(argc, argv, "bfhl:px")) != -1) switch (opt) { case 'b': /* do not output backspaces */ no_backspaces = 1; break; case 'f': /* allow half forward line feeds */ fine = 1; break; case 'h': /* compress spaces into tabs */ compress_spaces = 1; break; case 'l': /* buffered line count */ max_bufd_lines = strtonum(optarg, 1, (INT_MAX - BUFFER_MARGIN) / 2, &errstr) * 2; if (errstr != NULL) errx(1, "bad -l argument, %s: %s", errstr, optarg); break; case 'p': /* pass unknown control sequences */ pass_unknown_seqs = 1; break; case 'x': /* do not compress spaces into tabs */ compress_spaces = 0; break; case '?': default: usage(); } if (optind != argc) usage(); adjust = cur_col = extra_lines = warned = 0; cur_line = max_line = nflushd_lines = this_line = 0; cur_set = last_set = CS_NORMAL; lines = l = alloc_line(); while ((ch = getwchar()) != WEOF) { if (!iswgraph(ch)) { switch (ch) { case BS: /* can't go back further */ if (cur_col == 0) continue; --cur_col; continue; case CR: cur_col = 0; continue; case ESC: /* just ignore EOF */ switch(getwchar()) { /* * In the input stream, accept both the * XPG5 sequences ESC-digit and the * traditional BSD sequences ESC-ctrl. */ case '\007': /* FALLTHROUGH */ case RLF: addto_lineno(&cur_line, -2); break; case '\010': /* FALLTHROUGH */ case RHLF: addto_lineno(&cur_line, -1); break; case '\011': /* FALLTHROUGH */ case FHLF: addto_lineno(&cur_line, 1); if (cur_line > max_line) max_line = cur_line; } continue; case NL: addto_lineno(&cur_line, 2); if (cur_line > max_line) max_line = cur_line; cur_col = 0; continue; case SPACE: ++cur_col; continue; case SI: cur_set = CS_NORMAL; continue; case SO: cur_set = CS_ALTERNATE; continue; case TAB: /* adjust column */ cur_col |= 7; ++cur_col; continue; case VT: addto_lineno(&cur_line, -2); continue; } if (iswspace(ch)) { if ((width = wcwidth(ch)) > 0) cur_col += width; continue; } if (!pass_unknown_seqs) continue; } /* Must stuff ch in a line - are we at the right one? */ if (cur_line + adjust != this_line) { LINE *lnew; /* round up to next line */ adjust = !fine && (cur_line & 1); if (cur_line + adjust < this_line) { while (cur_line + adjust < this_line && l->l_prev != NULL) { l = l->l_prev; this_line--; } if (cur_line + adjust < this_line) { if (nflushd_lines == 0) { /* * Allow backup past first * line if nothing has been * flushed yet. */ while (cur_line + adjust < this_line) { lnew = alloc_line(); l->l_prev = lnew; lnew->l_next = l; l = lines = lnew; extra_lines++; this_line--; } } else { if (!warned++) dowarn(cur_line); cur_line = this_line - adjust; } } } else { /* may need to allocate here */ while (cur_line + adjust > this_line) { if (l->l_next == NULL) { l->l_next = alloc_line(); l->l_next->l_prev = l; } l = l->l_next; this_line++; } } if (this_line > nflushd_lines && this_line - nflushd_lines >= max_bufd_lines + BUFFER_MARGIN) { if (extra_lines) { flush_lines(extra_lines); extra_lines = 0; } flush_lines(this_line - nflushd_lines - max_bufd_lines); nflushd_lines = this_line - max_bufd_lines; } } /* grow line's buffer? */ if (l->l_line_len + 1 >= l->l_lsize) { int need; need = l->l_lsize ? l->l_lsize * 2 : 90; if ((l->l_line = realloc(l->l_line, (unsigned)need * sizeof(CHAR))) == NULL) err(1, NULL); l->l_lsize = need; } c = &l->l_line[l->l_line_len++]; c->c_char = ch; c->c_set = cur_set; c->c_column = cur_col; c->c_width = wcwidth(ch); /* * If things are put in out of order, they will need sorting * when it is flushed. */ if (cur_col < l->l_max_col) l->l_needs_sort = 1; else l->l_max_col = cur_col; if (c->c_width > 0) cur_col += c->c_width; } if (ferror(stdin)) err(1, NULL); if (extra_lines) flush_lines(extra_lines); /* goto the last line that had a character on it */ for (; l->l_next; l = l->l_next) this_line++; flush_lines(this_line - nflushd_lines + 1); /* make sure we leave things in a sane state */ if (last_set != CS_NORMAL) PUTC(SI); /* flush out the last few blank lines */ if (max_line > this_line) nblank_lines = max_line - this_line; if (max_line & 1) nblank_lines++; flush_blanks(); exit(0); } static void flush_lines(int nflush) { LINE *l; while (--nflush >= 0) { l = lines; lines = l->l_next; if (l->l_line) { flush_blanks(); flush_line(l); } if (l->l_line || l->l_next) nblank_lines++; if (l->l_line) (void)free(l->l_line); free_line(l); } if (lines) lines->l_prev = NULL; } /* * Print a number of newline/half newlines. If fine flag is set, nblank_lines * is the number of half line feeds, otherwise it is the number of whole line * feeds. */ static void flush_blanks(void) { int half, i, nb; half = 0; nb = nblank_lines; if (nb & 1) { if (fine) half = 1; else nb++; } nb /= 2; for (i = nb; --i >= 0;) PUTC('\n'); if (half) { PUTC(ESC); PUTC(FHLF); if (!nb) PUTC('\r'); } nblank_lines = 0; } /* * Write a line to stdout taking care of space to tab conversion (-h flag) * and character set shifts. */ static void flush_line(LINE *l) { CHAR *c, *endc; int i, j, nchars, last_col, save, this_col, tot; last_col = 0; nchars = l->l_line_len; if (l->l_needs_sort) { static CHAR *sorted; static int count_size, *count, sorted_size; /* * Do an O(n) sort on l->l_line by column being careful to * preserve the order of characters in the same column. */ if (l->l_lsize > sorted_size) { sorted_size = l->l_lsize; if ((sorted = realloc(sorted, (unsigned)sizeof(CHAR) * sorted_size)) == NULL) err(1, NULL); } if (l->l_max_col >= count_size) { count_size = l->l_max_col + 1; if ((count = realloc(count, (unsigned)sizeof(int) * count_size)) == NULL) err(1, NULL); } memset(count, 0, sizeof(int) * l->l_max_col + 1); for (i = nchars, c = l->l_line; --i >= 0; c++) count[c->c_column]++; /* * calculate running total (shifted down by 1) to use as * indices into new line. */ for (tot = 0, i = 0; i <= l->l_max_col; i++) { save = count[i]; count[i] = tot; tot += save; } for (i = nchars, c = l->l_line; --i >= 0; c++) sorted[count[c->c_column]++] = *c; c = sorted; } else c = l->l_line; while (nchars > 0) { this_col = c->c_column; endc = c; do { ++endc; } while (--nchars > 0 && this_col == endc->c_column); /* if -b only print last character */ if (no_backspaces) { c = endc - 1; if (nchars > 0 && this_col + c->c_width > endc->c_column) continue; } if (this_col > last_col) { int nspace = this_col - last_col; if (compress_spaces && nspace > 1) { while (1) { int tab_col, tab_size; tab_col = (last_col + 8) & ~7; if (tab_col > this_col) break; tab_size = tab_col - last_col; if (tab_size == 1) PUTC(' '); else PUTC('\t'); nspace -= tab_size; last_col = tab_col; } } while (--nspace >= 0) PUTC(' '); last_col = this_col; } for (;;) { if (c->c_set != last_set) { switch (c->c_set) { case CS_NORMAL: PUTC(SI); break; case CS_ALTERNATE: PUTC(SO); } last_set = c->c_set; } PUTC(c->c_char); if ((c + 1) < endc) for (j = 0; j < c->c_width; j++) PUTC('\b'); if (++c >= endc) break; } last_col += (c - 1)->c_width; } } /* * Increment or decrement a line number, checking for overflow. * Stop one below INT_MAX such that the adjust variable is safe. */ void addto_lineno(int *lno, int offset) { if (offset > 0) { if (*lno >= INT_MAX - offset) errx(1, "too many lines"); } else { if (*lno < INT_MIN - offset) errx(1, "too many reverse line feeds"); } *lno += offset; } #define NALLOC 64 static LINE *line_freelist; static LINE * alloc_line(void) { LINE *l; int i; if (!line_freelist) { if ((l = realloc(NULL, sizeof(LINE) * NALLOC)) == NULL) err(1, NULL); line_freelist = l; for (i = 1; i < NALLOC; i++, l++) l->l_next = l + 1; l->l_next = NULL; } l = line_freelist; line_freelist = l->l_next; memset(l, 0, sizeof(LINE)); return (l); } static void free_line(LINE *l) { l->l_next = line_freelist; line_freelist = l; } static void usage(void) { (void)fprintf(stderr, "usage: col [-bfhpx] [-l nline]\n"); exit(1); } static void dowarn(int line) { warnx("warning: can't back up %s", line < 0 ? "past first line" : "-- line already flushed"); } Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/checking.cc =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/checking.cc (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/checking.cc (revision 306832) @@ -1,259 +1,285 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "checking.hh" #include +using std::string; - namespace dtc { namespace fdt { namespace checking { namespace { + struct deleted_node_checker : public checker + { + deleted_node_checker(const char *name) : checker(name) {} + virtual bool check_node(device_tree *, const node_ptr &n) + { + auto &deleted = n->deleted_child_nodes(); + if (deleted.empty()) + { + return true; + } + bool plural = deleted.size() > 1; + string errmsg("Attempts to delete "); + errmsg += plural ? "nodes" : "node"; + errmsg += " that "; + errmsg += plural ? "were" : "was"; + errmsg += " not added in merge: "; + for (auto &d : deleted) + { + errmsg += d; + } + report_error(errmsg.c_str()); + return false; + } + }; /** * Checker that verifies that every node that has children has * #address-cells and #size-cells properties. */ struct address_cells_checker : public checker { address_cells_checker(const char *name) : checker(name) {} virtual bool check_node(device_tree *, const node_ptr &n) { // If this has no children, it trivially meets the // conditions. if (n->child_begin() == n->child_end()) { return true; } bool found_address = false; bool found_size = false; for (auto i=n->property_begin(), e=n->property_end() ; i!=e ; ++i) { if (!found_address) { found_address = ((*i)->get_key() == "#address-cells"); } if (!found_size) { found_size = ((*i)->get_key() == "#size-cells"); } if (found_size && found_address) { - break; + break; } } if (!found_address) { - report_error("Missing #address-cells property"); + report_error("Missing #address-cells property"); } if (!found_size) { - report_error("Missing #size-cells property"); + report_error("Missing #size-cells property"); } return found_address && found_size; } }; } // anonymous namespace bool checker::visit_node(device_tree *tree, const node_ptr &n) { path.push_back(std::make_pair(n->name, n->unit_address)); // Check this node if (!check_node(tree, n)) { return false; } // Now check its properties for (auto i=n->property_begin(), e=n->property_end() ; i!=e ; ++i) { if (!check_property(tree, n, *i)) { return false; } } // And then recursively check the children for (node::child_iterator i=n->child_begin(), e=n->child_end() ; i!=e ; ++i) { if (!visit_node(tree, *i)) { return false; } } path.pop_back(); return true; } void checker::report_error(const char *errmsg) { fprintf(stderr, "Error: %s, while checking node: ", errmsg); for (auto &p : path) { putc('/', stderr); - p.first.dump(); + puts(p.first.c_str()); if (!(p.second.empty())) { putc('@', stderr); - p.second.dump(); + puts(p.second.c_str()); } } fprintf(stderr, " [-W%s]\n", checker_name); } bool property_checker::check_property(device_tree *tree, const node_ptr &n, property_ptr p) { if (p->get_key() == key) { if (!check(tree, n, p)) { report_error("property check failed"); return false; } } return true; } bool property_size_checker::check(device_tree *, const node_ptr &, property_ptr p) { uint32_t psize = 0; for (property::value_iterator i=p->begin(),e=p->end() ; i!=e ; ++i) { if (!i->is_binary()) { return false; } psize += i->byte_data.size(); } return psize == size; } template void -check_manager::add_property_type_checker(const char *name, string prop) +check_manager::add_property_type_checker(const char *name, const string &prop) { checkers.insert(std::make_pair(string(name), new property_type_checker(name, prop))); } void check_manager::add_property_size_checker(const char *name, - string prop, + const string &prop, uint32_t size) { checkers.insert(std::make_pair(string(name), new property_size_checker(name, prop, size))); } check_manager::~check_manager() { while (checkers.begin() != checkers.end()) { delete checkers.begin()->second; checkers.erase(checkers.begin()); } while (disabled_checkers.begin() != disabled_checkers.end()) { delete disabled_checkers.begin()->second; disabled_checkers.erase(disabled_checkers.begin()); } } check_manager::check_manager() { // NOTE: All checks listed here MUST have a corresponding line // in the man page! add_property_type_checker( "type-compatible", string("compatible")); add_property_type_checker( "type-model", string("model")); add_property_size_checker("type-phandle", string("phandle"), 4); disabled_checkers.insert(std::make_pair(string("cells-attributes"), new address_cells_checker("cells-attributes"))); + checkers.insert(std::make_pair(string("deleted-nodes"), + new deleted_node_checker("deleted-nodes"))); } bool check_manager::run_checks(device_tree *tree, bool keep_going) { bool success = true; for (auto &i : checkers) { success &= i.second->check_tree(tree); if (!(success || keep_going)) { break; } } return success; } bool -check_manager::disable_checker(string name) +check_manager::disable_checker(const string &name) { auto checker = checkers.find(name); if (checker != checkers.end()) { disabled_checkers.insert(std::make_pair(name, checker->second)); checkers.erase(checker); return true; } return false; } bool -check_manager::enable_checker(string name) +check_manager::enable_checker(const string &name) { auto checker = disabled_checkers.find(name); if (checker != disabled_checkers.end()) { checkers.insert(std::make_pair(name, checker->second)); disabled_checkers.erase(checker); return true; } return false; } } // namespace checking } // namespace fdt } // namespace dtc Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/checking.hh =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/checking.hh (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/checking.hh (revision 306832) @@ -1,308 +1,310 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _CHECKING_HH_ #define _CHECKING_HH_ -#include "string.hh" +#include #include "fdt.hh" namespace dtc { namespace fdt { namespace checking { /** * Base class for all checkers. This will visit the entire tree and perform * semantic checks defined in subclasses. Note that device trees are generally * small (a few dozen nodes at most) and so we optimise for flexibility and * extensibility here, not for performance. Each checker will visit the entire * tree. */ class checker { /** * The path to the current node being checked. This is used for * printing error messages. */ device_tree::node_path path; /** * The name of the checker. This is used for printing error messages * and for enabling / disabling specific checkers from the command - * line. + * line. */ const char *checker_name; /** * Visits each node, calling the checker functions on properties and * nodes. */ bool visit_node(device_tree *tree, const node_ptr &n); protected: /** * Prints the error message, along with the path to the node that * caused the error and the name of the checker. */ void report_error(const char *errmsg); public: /** * Constructor. Takes the name of this checker, which is which is used * when reporting errors. */ checker(const char *name) : checker_name(name) {} /** * Virtual destructor in case any subclasses need to do cleanup. */ virtual ~checker() {} /** * Method for checking that a node is valid. The root class version * does nothing, subclasses should override this. */ virtual bool check_node(device_tree *, const node_ptr &) { return true; } /** * Method for checking that a property is valid. The root class * version does nothing, subclasses should override this. */ virtual bool check_property(device_tree *, const node_ptr &, property_ptr ) { return true; } /** * Runs the checker on the specified device tree. */ bool check_tree(fdt::device_tree *tree) { return visit_node(tree, tree->get_root()); } }; /** * Abstract base class for simple property checks. This class defines a check * method for subclasses, which is invoked only when it finds a property with * the matching name. To define simple property checkers, just subclass this * and override the check() method. */ class property_checker : public checker { /** * The name of the property that this checker is looking for. */ - string key; + std::string key; public: /** * Implementation of the generic property-checking method that checks - * for a property with the name specified in the constructor + * for a property with the name specified in the constructor. */ virtual bool check_property(device_tree *tree, const node_ptr &n, property_ptr p); /** * Constructor. Takes the name of the checker and the name of the * property to check. */ - property_checker(const char* name, string property_name) + property_checker(const char* name, const std::string &property_name) : checker(name), key(property_name) {} /** * The check method, which subclasses should implement. */ virtual bool check(device_tree *tree, const node_ptr &n, property_ptr p) = 0; }; /** * Property type checker. */ template struct property_type_checker : public property_checker { /** * Constructor, takes the name of the checker and the name of the * property to check as arguments. */ - property_type_checker(const char* name, string property_name) : + property_type_checker(const char* name, const std::string &property_name) : property_checker(name, property_name) {} virtual bool check(device_tree *tree, const node_ptr &n, property_ptr p) = 0; }; /** * Empty property checker. This checks that the property has no value. */ template<> struct property_type_checker : public property_checker { - property_type_checker(const char* name, string property_name) : + property_type_checker(const char* name, const std::string &property_name) : property_checker(name, property_name) {} virtual bool check(device_tree *, const node_ptr &, property_ptr p) { return p->begin() == p->end(); } }; /** * String property checker. This checks that the property has exactly one * value, which is a string. */ template<> struct property_type_checker : public property_checker { - property_type_checker(const char* name, string property_name) : + property_type_checker(const char* name, const std::string &property_name) : property_checker(name, property_name) {} virtual bool check(device_tree *, const node_ptr &, property_ptr p) { return (p->begin() + 1 == p->end()) && p->begin()->is_string(); } }; /** * String list property checker. This checks that the property has at least * one value, all of which are strings. */ template<> struct property_type_checker : public property_checker { - property_type_checker(const char* name, string property_name) : + property_type_checker(const char* name, const std::string &property_name) : property_checker(name, property_name) {} virtual bool check(device_tree *, const node_ptr &, property_ptr p) { for (property::value_iterator i=p->begin(),e=p->end() ; i!=e ; ++i) { if (!(i->is_string() || i->is_string_list())) { return false; } } return p->begin() != p->end(); } }; /** * Phandle property checker. This checks that the property has exactly one * value, which is a valid phandle. */ template<> struct property_type_checker : public property_checker { - property_type_checker(const char* name, string property_name) : + property_type_checker(const char* name, const std::string &property_name) : property_checker(name, property_name) {} virtual bool check(device_tree *tree, const node_ptr &, property_ptr p) { - return (p->begin() + 1 == p->end()) && + return (p->begin() + 1 == p->end()) && (tree->referenced_node(*p->begin()) != 0); } }; /** * Check that a property has the correct size. */ struct property_size_checker : public property_checker { /** * The expected size of the property. */ uint32_t size; public: /** * Constructor, takes the name of the checker, the name of the property * to check, and its expected size as arguments. */ - property_size_checker(const char* name, string property_name, uint32_t bytes) + property_size_checker(const char* name, + const std::string &property_name, + uint32_t bytes) : property_checker(name, property_name), size(bytes) {} /** * Check, validates that the property has the correct size. */ virtual bool check(device_tree *tree, const node_ptr &n, property_ptr p); }; /** * The check manager is the interface to running the checks. This allows * default checks to be enabled, non-default checks to be enabled, and so on. */ class check_manager { /** * The enabled checkers, indexed by their names. The name is used when * disabling checkers from the command line. When this manager runs, * it will only run the checkers from this map. */ - std::unordered_map checkers; + std::unordered_map checkers; /** * The disabled checkers. Moving checkers to this list disables them, * but allows them to be easily moved back. */ - std::unordered_map disabled_checkers; + std::unordered_map disabled_checkers; /** * Helper function for adding a property value checker. */ template - void add_property_type_checker(const char *name, string prop); + void add_property_type_checker(const char *name, const std::string &prop); /** * Helper function for adding a simple type checker. */ - void add_property_type_checker(const char *name, string prop); + void add_property_type_checker(const char *name, const std::string &prop); /** * Helper function for adding a property value checker. */ void add_property_size_checker(const char *name, - string prop, + const std::string &prop, uint32_t size); public: /** * Delete all of the checkers that are part of this checker manager. */ ~check_manager(); /** * Default constructor, creates check manager containing all of the * default checks. */ check_manager(); /** * Run all of the checks on the specified tree. */ bool run_checks(device_tree *tree, bool keep_going); /** * Disables the named checker. */ - bool disable_checker(string name); + bool disable_checker(const std::string &name); /** - * Enables the named checker. + * Enables the named checker. */ - bool enable_checker(string name); + bool enable_checker(const std::string &name); }; } // namespace checking } // namespace fdt } // namespace dtc #endif // !_CHECKING_HH_ Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/dtb.cc =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/dtb.cc (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/dtb.cc (revision 306832) @@ -1,311 +1,313 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "dtb.hh" #include #include #include #include +using std::string; namespace dtc { namespace dtb { void output_writer::write_data(byte_buffer b) { for (auto i : b) { write_data(i); } } void -binary_writer::write_string(string name) +binary_writer::write_string(const string &name) { - name.push_to_buffer(buffer); + push_string(buffer, name); // Trailing nul buffer.push_back(0); } void binary_writer::write_data(uint8_t v) { buffer.push_back(v); } void binary_writer::write_data(uint32_t v) { while (buffer.size() % 4 != 0) { buffer.push_back(0); } push_big_endian(buffer, v); } void binary_writer::write_data(uint64_t v) { while (buffer.size() % 8 != 0) { buffer.push_back(0); } push_big_endian(buffer, v); } void binary_writer::write_to_file(int fd) { // FIXME: Check return write(fd, buffer.data(), buffer.size()); } uint32_t binary_writer::size() { return buffer.size(); } void -asm_writer::write_string(const char *c) -{ - while (*c) - { - buffer.push_back((uint8_t)*(c++)); - } -} - -void asm_writer::write_line(const char *c) { if (byte_count != 0) { byte_count = 0; buffer.push_back('\n'); } write_string(c); } void asm_writer::write_byte(uint8_t b) { char out[3] = {0}; if (byte_count++ == 0) { buffer.push_back('\t'); } write_string(".byte 0x"); snprintf(out, 3, "%.2hhx", b); buffer.push_back(out[0]); buffer.push_back(out[1]); if (byte_count == 4) { buffer.push_back('\n'); byte_count = 0; } else { buffer.push_back(';'); buffer.push_back(' '); } } void -asm_writer::write_label(string name) +asm_writer::write_label(const string &name) { write_line("\t.globl "); - name.push_to_buffer(buffer); + push_string(buffer, name); buffer.push_back('\n'); - name.push_to_buffer(buffer); + push_string(buffer, name); buffer.push_back(':'); buffer.push_back('\n'); buffer.push_back('_'); - name.push_to_buffer(buffer); + push_string(buffer, name); buffer.push_back(':'); buffer.push_back('\n'); } void -asm_writer::write_comment(string name) +asm_writer::write_comment(const string &name) { write_line("\t/* "); - name.push_to_buffer(buffer); + push_string(buffer, name); write_string(" */\n"); } void -asm_writer::write_string(string name) +asm_writer::write_string(const char *c) { + while (*c) + { + buffer.push_back((uint8_t)*(c++)); + } +} + + +void +asm_writer::write_string(const string &name) +{ write_line("\t.string \""); - name.push_to_buffer(buffer); + push_string(buffer, name); write_line("\"\n"); bytes_written += name.size() + 1; } void asm_writer::write_data(uint8_t v) { write_byte(v); bytes_written++; } void asm_writer::write_data(uint32_t v) { if (bytes_written % 4 != 0) { write_line("\t.balign 4\n"); bytes_written += (4 - (bytes_written % 4)); } write_byte((v >> 24) & 0xff); write_byte((v >> 16) & 0xff); write_byte((v >> 8) & 0xff); write_byte((v >> 0) & 0xff); bytes_written += 4; } void asm_writer::write_data(uint64_t v) { if (bytes_written % 8 != 0) { write_line("\t.balign 8\n"); bytes_written += (8 - (bytes_written % 8)); } write_byte((v >> 56) & 0xff); write_byte((v >> 48) & 0xff); write_byte((v >> 40) & 0xff); write_byte((v >> 32) & 0xff); write_byte((v >> 24) & 0xff); write_byte((v >> 16) & 0xff); write_byte((v >> 8) & 0xff); write_byte((v >> 0) & 0xff); bytes_written += 8; } void asm_writer::write_to_file(int fd) { // FIXME: Check return write(fd, buffer.data(), buffer.size()); } uint32_t asm_writer::size() { return bytes_written; } void header::write(output_writer &out) { - out.write_label(string("dt_blob_start")); - out.write_label(string("dt_header")); + out.write_label("dt_blob_start"); + out.write_label("dt_header"); out.write_comment("magic"); out.write_data(magic); out.write_comment("totalsize"); out.write_data(totalsize); out.write_comment("off_dt_struct"); out.write_data(off_dt_struct); out.write_comment("off_dt_strings"); out.write_data(off_dt_strings); out.write_comment("off_mem_rsvmap"); out.write_data(off_mem_rsvmap); out.write_comment("version"); out.write_data(version); out.write_comment("last_comp_version"); out.write_data(last_comp_version); out.write_comment("boot_cpuid_phys"); out.write_data(boot_cpuid_phys); out.write_comment("size_dt_strings"); out.write_data(size_dt_strings); out.write_comment("size_dt_struct"); out.write_data(size_dt_struct); } bool header::read_dtb(input_buffer &input) { if (!(input.consume_binary(magic) && magic == 0xd00dfeed)) { fprintf(stderr, "Missing magic token in header. Got %" PRIx32 " expected 0xd00dfeed\n", magic); return false; } return input.consume_binary(totalsize) && input.consume_binary(off_dt_struct) && input.consume_binary(off_dt_strings) && input.consume_binary(off_mem_rsvmap) && input.consume_binary(version) && input.consume_binary(last_comp_version) && input.consume_binary(boot_cpuid_phys) && input.consume_binary(size_dt_strings) && input.consume_binary(size_dt_struct); } uint32_t -string_table::add_string(string str) +string_table::add_string(const string &str) { auto old = string_offsets.find(str); if (old == string_offsets.end()) { uint32_t start = size; // Don't forget the trailing nul size += str.size() + 1; string_offsets.insert(std::make_pair(str, start)); strings.push_back(str); return start; } else { return old->second; } } void string_table::write(dtb::output_writer &writer) { - writer.write_comment(string("Strings table.")); - writer.write_label(string("dt_strings_start")); + writer.write_comment("Strings table."); + writer.write_label("dt_strings_start"); for (auto &i : strings) { writer.write_string(i); } - writer.write_label(string("dt_strings_end")); + writer.write_label("dt_strings_end"); } } // namespace dtb } // namespace dtc Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/dtb.hh =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/dtb.hh (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/dtb.hh (revision 306832) @@ -1,365 +1,371 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _DTB_HH_ #define _DTB_HH_ #include -#include "string.hh" +#include #include +#include "input_buffer.hh" +#include "util.hh" + namespace dtc { /** * The dtb namespace contains code related to the generation of device tree * blobs, the binary representation of flattened device trees. The abstract * tree representation calls into this code to generate the output. */ namespace dtb { /** The token types in the DTB, as defined by §7.4.1 of the ePAPR * specification. All of these values are written in big-endian format in the * output. */ enum token_type { /** * Marker indicating the start of a node in the tree. This is followed * by the nul-terminated name. If a unit address is specified, then * the name also contains the address, with an @ symbol between the end * of the name and the start of the address. * * The name is then padded such that the next token begins on a 4-byte * boundary. The node may contain properties, other nodes, both, or be * empty. */ FDT_BEGIN_NODE = 0x00000001, /** * Marker indicating the end of a node. */ FDT_END_NODE = 0x00000002, /** * The start of a property. This is followed by two 32-bit big-endian * values. The first indicates the length of the property value, the * second its index in the strings table. It is then followed by the * property value, if the value is of non-zero length. */ FDT_PROP = 0x00000003, /** * Ignored token. May be used for padding inside DTB nodes. */ FDT_NOP = 0x00000004, /** * Marker indicating the end of the tree. */ FDT_END = 0x00000009 }; /** * Returns the token as a string. This is used for debugging and for printing * human-friendly error messages about malformed DTB input. */ inline const char *token_type_name(token_type t) { switch(t) { case FDT_BEGIN_NODE: return "FDT_BEGIN_NODE"; case FDT_END_NODE: return "FDT_END_NODE"; case FDT_PROP: return "FDT_PROP"; case FDT_NOP: return "FDT_NOP"; case FDT_END: return "FDT_END"; } assert(0); } /** * Abstract class for writing a section of the output. We create one * of these for each section that needs to be written. It is intended to build * a temporary buffer of the output in memory and then write it to a file * stream. The size can be returned after all of the data has been written * into the internal buffer, so the sizes of the three tables can be calculated * before storing them in the buffer. */ struct output_writer { /** * Writes a label into the output stream. This is only applicable for * assembly output, where the labels become symbols that can be * resolved at link time. */ - virtual void write_label(string name) = 0; + virtual void write_label(const std::string &name) = 0; /** * Writes a comment into the output stream. Useful only when debugging * the output. */ - virtual void write_comment(string name) = 0; + virtual void write_comment(const std::string &name) = 0; /** * Writes a string. A nul terminator is implicitly added. */ - virtual void write_string(string name) = 0; + virtual void write_string(const std::string &name) = 0; /** * Writes a single 8-bit value. */ virtual void write_data(uint8_t) = 0; /** * Writes a single 32-bit value. The value is written in big-endian * format, but should be passed in the host's native endian. */ virtual void write_data(uint32_t) = 0; /** * Writes a single 64-bit value. The value is written in big-endian * format, but should be passed in the host's native endian. */ virtual void write_data(uint64_t) = 0; /** * Writes the collected output to the specified file descriptor. */ virtual void write_to_file(int fd) = 0; /** * Returns the number of bytes. */ virtual uint32_t size() = 0; /** * Helper for writing tokens to the output stream. This writes a * comment above the token describing its value, for easier debugging * of the output. */ inline void write_token(token_type t) { write_comment(token_type_name(t)); write_data((uint32_t)t); } /** * Helper function that writes a byte buffer to the output, one byte at * a time. */ void write_data(byte_buffer b); }; /** * Binary file writer. This class is responsible for writing the DTB output * directly in blob format. */ class binary_writer : public output_writer { /** * The internal buffer used to store the blob while it is being * constructed. */ byte_buffer buffer; public: /** * The binary format does not support labels, so this method * does nothing. */ - virtual void write_label(string) {} + virtual void write_label(const std::string &) {} /** * Comments are ignored by the binary writer. */ - virtual void write_comment(string) {} - virtual void write_string(string name); + virtual void write_comment(const std::string&) {} + virtual void write_string(const std::string &name); virtual void write_data(uint8_t v); virtual void write_data(uint32_t v); virtual void write_data(uint64_t v); virtual void write_to_file(int fd); virtual uint32_t size(); }; /** * Assembly writer. This class is responsible for writing the output in an * assembly format that is suitable for linking into a kernel, loader, and so * on. */ class asm_writer : public output_writer { /** * The internal buffer for temporary values. Note that this actually * contains ASCII text, but it is a byte buffer so that we can just * copy strings across as-is. */ byte_buffer buffer; /** * The number of bytes written to the current line. This is used to * allow line wrapping, where we aim to write four .byte directives to * make the alignment clearer. */ int byte_count; /** * The current number of bytes written. This is the number in binary * format, not the number of bytes in the buffer. */ uint32_t bytes_written; /** - * Writes a C string directly to the output as-is. This is mainly used - * for writing directives. + * Writes a string directly to the output as-is. This is the function that + * performs the real output. */ void write_string(const char *c); /** + * Write a string to the output. + */ + void write_string(const std::string &c); + /** * Writes the string, starting on a new line. */ void write_line(const char *c); /** * Writes a byte in binary format. This will emit a single .byte * directive, with up to four per line. */ void write_byte(uint8_t b); public: asm_writer() : byte_count(0), bytes_written(0) {} - virtual void write_label(string name); - virtual void write_comment(string name); - virtual void write_string(string name); + virtual void write_label(const std::string &name); + virtual void write_comment(const std::string &name); virtual void write_data(uint8_t v); virtual void write_data(uint32_t v); virtual void write_data(uint64_t v); virtual void write_to_file(int fd); virtual uint32_t size(); }; /** * Class encapsulating the device tree blob header. This class stores all of * the values found in the header and is responsible for writing them to the * output. */ struct header { /** * Magic value, used to validate that this really is a device tree * blob. Should always be set to 0xd00dfeed. */ uint32_t magic; /** * The total size of the blob, including header, reservations, strings * table, and padding. */ uint32_t totalsize; /** * The offset from the start of the blob of the struct table (i.e. the * part of the blob containing the entire device tree). */ uint32_t off_dt_struct; /** * The offset from the start of the blob of the strings table. */ uint32_t off_dt_strings; /** * The offset of the reservation map from the start of the blob. */ uint32_t off_mem_rsvmap; /** * The version of the blob. This should always be 17. */ uint32_t version; /** * The earliest version of the DTB specification with which this blob * is backwards compatible. This should always be 16. */ uint32_t last_comp_version; /** * The ID of the CPU where this boots. */ uint32_t boot_cpuid_phys; /** * The size of the strings table. */ uint32_t size_dt_strings; /** * The size of the struct table. */ uint32_t size_dt_struct; /** * Writes the entire header to the specified output buffer. */ void write(output_writer &out); /** * Reads the header from bits binary representation in a blob. */ bool read_dtb(input_buffer &input); /** * Default constructor. Initialises the values that have sensible * defaults, leaves the others blank. */ header() : magic(0xd00dfeed), version(17), last_comp_version(16), boot_cpuid_phys(0) {} }; /** * Class encapsulating the string table. FDT strings are stored in a string * section. This maintains a map from strings to their offsets in the strings * section. * * Note: We don't currently do suffix matching, which may save a small amount * of space. */ class string_table { /** * Map from strings to their offset. */ - std::map string_offsets; + std::map string_offsets; /** * The strings, in the order in which they should be written to the * output. The order must be stable - adding another string must not * change the offset of any that we have already referenced - and so we * simply write the strings in the order that they are passed. */ - std::vector strings; + std::vector strings; /** * The current size of the strings section. */ uint32_t size; public: /** * Default constructor, creates an empty strings table. */ string_table() : size(0) {} /** * Adds a string to the table, returning the offset from the start * where it will be written. If the string is already present, this * will return its existing offset, otherwise it will return a new * offset. */ - uint32_t add_string(string str); + uint32_t add_string(const std::string &str); /** * Writes the strings table to the specified output. */ void write(dtb::output_writer &writer); }; } // namespace dtb } // namespace dtc #endif // !_DTB_HH_ Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/dtc.1 =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/dtc.1 (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/dtc.1 (revision 306832) @@ -1,332 +1,336 @@ .\"- .\" Copyright (c) 2013 David Chisnall .\" All rights reserved. .\" .\" This software was developed by SRI International and the University of .\" Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) .\" ("CTSRD"), as part of the DARPA CRASH research programme. .\" .\" This software was developed by SRI International and the University of .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\"/ .Dd January 1, 2013 .Dt DTC 1 .Os .Sh NAME .Nm dtc .Nd device tree compiler .Sh SYNOPSIS .Nm .Op Fl fhsv .Op Fl b Ar boot_cpu_id .Op Fl d Ar dependency_file .Op Fl E Ar [no-]checker_name .Op Fl H Ar phandle_format .Op Fl I Ar input_format .Op Fl O Ar output_format .Op Fl o Ar output_file .Op Fl R Ar entries .Op Fl S Ar bytes .Op Fl p Ar bytes .Op Fl V Ar blob_version .Op Fl W Ar [no-]checker_name .Op Fl P Ar predefined_properties .Ar input_file .Sh DESCRIPTION The .Nm utility converts flattened device tree (FDT) representations. It is most commonly used to generate device tree blobs (DTB), the binary representation of an FDT, from device tree sources (DTS), the ASCII text source representation. .Pp The binary can be written in two formats, binary and assembly. The binary is identical to the in-memory representation and can be used directly by firmware, loaders, and so on. The assembly format, documented in .Sx "ASM FORMAT" , will produce the same binary format when assembled, but also includes some global variables that refer to parts of the table. This format is most commonly used to produce a kernel specific to a device, with the device tree blob compiled in. .Pp The options are as follows: .Bl -tag -width indent .It Fl d Ar dependency_file Writes a dependency file understandable by make to the specified file. This file can be included in a Makefile and will ensure that the output file depends on the input file and any files that it includes. This argument is only useful when the input is DTS, as only the source format has a notion of inclusions. .It Fl E Ar [no-]checker_name Enable or disable a specified checker. The argument is the name of the checker. The full list of checkers is given in .Sx CHECKERS . .It Fl f Force the tool to attempt to generate the output, even if the input had errors. .It Fl h Display the help text and exit. .It Fl H Ar phandle_format Specifies the type of phandle nodes to generate in the output. Valid values are: .Pp .Bl -tag -width indent -compact .It Ar linux Generate the legacy linux,phandle nodes expected by older systems. .It Ar epapr Generate the phandle nodes, as described in the ePAPR specification. This is the most sensible option for device trees being used with .Fx . .It Ar both Generate both, for maximum compatibility. .El .It Fl I Ar input_format Specifies the input format. Valid values are: .Pp .Bl -tag -width indent -compact .It Ar dtb Device tree blob. The binary representation of the FDT. .It Ar dts Device tree source. The ASCII representation of the FDT. This is the default if the input format is not explicitly stated. .El .It Fl O Ar output_format Specifies the output format. Valid values are: .Pp .Bl -tag -width indent -compact .It Ar asm Assembler source for generating a device tree blob, as described in .Sx "ASM FORMAT" . .It Ar dtb Device tree blob. The binary representation of the FDT. This is the default if the output format is not explicitly stated. .It Ar dts Device tree source. The ASCII representation of the FDT. .El .It Fl o Ar output_file The file to which to write the output. .It Fl P Ar predefined_macro Defines a macro, in the form .Ar name=value or .Ar name to be used for device tree source files that contain conditional components. This tool supports two extensions to the standard to support conditional compilation of device trees. The first is an .Ar /include/if [property]/ "file.dts" directive that is allowed at the start of a file and which will only include the specified file if it the specified property is passed with this flag. The second is the .Ar $NAME format for property values. These allow property value to be specified on the command line. .It Fl R Ar entries The number of empty reservation table entries to pad the table with. This is useful if you are generating a device tree blob for bootloader or similar that needs to reserve some memory before passing control to the operating system. .It Fl S Ar bytes The minimum size in bytes of the blob. The blob will be padded after the strings table to ensure that it is the correct size. This is useful for environments where the device tree blob must be modified in place. .It Fl p Ar bytes The number of bytes of padding to add to the blob. The blob will be padded after the strings table to ensure that it is the correct size. This is useful for environments where the device tree blob must be modified in place. .It Fl W Ar [no-]checker_name Enable or disable a specified checker. This is an alias for .Fl E . .It Fl s Sorts the properties and nodes in the tree. This is mainly useful when using tools like .Xr diff 1 to compare two device tree sources. .It Fl V Ar output_version The version of the format to output. This is only relevant for binary outputs, and only a value of 17 is currently supported. .It Fl v Display the tool version and exit. .It Ar input_file The source file. .El .Sh "ASM FORMAT" The assembly format defines several globals that can be referred to from other compilation units, in addition to any labels specified in the source. These are: .Pp .Bl -tag -width "dt_strings_start" -compact -offset indent .It dt_blob_start start of the device tree blob. .It dt_header start of the header, usually identical to the start of the blob. .It dt_reserve_map start of the reservation map. .It dt_struct_start start of the structure table. .It dt_struct_end end of the structure table. .It dt_strings_start start of the strings table. .It dt_strings_end end of the strings table. .It dt_blob_end end of the device tree blob. .El .Sh CHECKERS The utility provides a number of semantic checks on the correctness of the tree. These can be disabled with the .Fl W flag. For example, .Fl W Ar no-type-phandle will disable the phandle type check. The supported checks are: .Pp .Bl -tag -width "no-type-phandle" -compact -offset indent .It type-compatible Checks the type of the .Va compatible property. .It type-model Checks the type of the .Va model property. .It type-compatible Checks the type of the .Va compatible property. .It cells-attributes Checks that all nodes with children have both .Va #address-cells and .Va #size-cells properties. +.It deleted-nodes +Checks that all +.Va /delete-node/ +statements refer to nodes that are merged. .El .Sh EXAMPLES The command: .Pp .Dl "dtc -o blob.S -O asm device.dts" .Pp will generate a .Pa blob.S file from the device tree source .Pa device.dts and print errors if any occur during parsing or property checking. The resulting file can be assembled and linked into a binary. .Pp The command: .Pp .Dl "dtc -o - -O dts -I dtb device.dtb" .Pp will write the device tree source for the device tree blob .Pa device.dtb to the standard output. This is useful when debugging device trees. .Sh COMPATIBILITY This utility is intended to be compatible with the device tree compiler provided by elinux.org. Currently, it implements the subset of features required to build FreeBSD and others that have been requested by FreeBSD developers. .Pp The .Ar fs input format is not supported. This builds a tree from a Linux .Pa /proc/device-tree , a file system hierarchy not found in FreeBSD, which instead exposes the DTB directly via a sysctl. .Pp The warnings and errors supported by the elinux.org tool are not documented. This tool supports the warnings described in the .Sx CHECKERS section. .Sh SEE ALSO .Xr fdt 4 .Sh STANDARDS The device tree formats understood by this tool conform to the Power.org Standard for Embedded Power Architecture Platform Requirements .Pq Vt ePAPR , except as noted in the .Sx BUGS section and with the following exceptions for compatibility with the elinux.org tool: .Pp .Bl -bullet -compact .It The target of cross references is defined to be a node name in the specification, but is in fact a label. .El .Pp The /include/ directive is not part of the standard, however it is implemented with the semantics compatible with the elinux.org tool. It must appear in the top level of a file, and imports a new root definition. If a file, plus all of its inclusions, contains multiple roots then they are merged. All nodes that are present in the second but not the first are imported. Any that appear in both are recursively merged, with properties from the second replacing those from the first and properties child nodes being recursively merged. .Sh HISTORY A dtc tool first appeared in .Fx 9.0 . This version of the tool first appeared in .Fx 10.0 . .Sh AUTHORS .An David T. Chisnall .Pp Note: The fact that the tool and the author share the same initials is entirely coincidental. .Sh BUGS The device tree compiler does not yet support the following features: .Pp .Bl -bullet -compact .It Labels in the middle of property values. This is only useful in the assembly output, and only vaguely useful there, so is unlikely to be added soon. .It Full paths, rather than labels, as the targets for phandles. This is not very hard to add, but will probably not be added until something actually needs it. .El .Pp The current version performs a very limited set of semantic checks on the tree. This will be improved in future versions. Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/dtc.cc =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/dtc.cc (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/dtc.cc (revision 306832) @@ -1,344 +1,345 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include "fdt.hh" #include "checking.hh" +#include "util.hh" using namespace dtc; +using std::string; /** * The current major version of the tool. */ int version_major = 0; /** * The current minor version of the tool. */ int version_minor = 4; /** * The current patch level of the tool. */ int version_patch = 0; -static void usage(const char* argv0) +static void usage(const string &argv0) { fprintf(stderr, "Usage:\n" "\t%s\t[-fhsv] [-b boot_cpu_id] [-d dependency_file]" "[-E [no-]checker_name]\n" "\t\t[-H phandle_format] [-I input_format]" "[-O output_format]\n" "\t\t[-o output_file] [-R entries] [-S bytes] [-p bytes]" "[-V blob_version]\n" - "\t\t-W [no-]checker_name] input_file\n", basename((char*)argv0)); + "\t\t-W [no-]checker_name] input_file\n", basename(argv0).c_str()); } /** * Prints the current version of this program.. */ static void version(const char* progname) { fprintf(stderr, "Version: %s %d.%d.%d\n", progname, version_major, version_minor, version_patch); } using fdt::device_tree; int main(int argc, char **argv) { int ch; int outfile = fileno(stdout); const char *outfile_name = "-"; const char *in_file = "-"; FILE *depfile = 0; bool debug_mode = false; - void (device_tree::*write_fn)(int) = &device_tree::write_binary; - void (device_tree::*read_fn)(const char*, FILE*) = - &device_tree::parse_dts; + auto write_fn = &device_tree::write_binary; + auto read_fn = &device_tree::parse_dts; uint32_t boot_cpu; bool boot_cpu_specified = false; bool keep_going = false; bool sort = false; clock_t c0 = clock(); class device_tree tree; fdt::checking::check_manager checks; const char *options = "hqI:O:o:V:d:R:S:p:b:fi:svH:W:E:DP:"; // Don't forget to update the man page if any more options are added. while ((ch = getopt(argc, argv, options)) != -1) { switch (ch) { case 'h': usage(argv[0]); return EXIT_SUCCESS; case 'v': version(argv[0]); return EXIT_SUCCESS; case 'I': { - string arg = string(optarg); - if (arg == string("dtb")) + string arg(optarg); + if (arg == "dtb") { read_fn = &device_tree::parse_dtb; } - else if (arg == string("dts")) + else if (arg == "dts") { read_fn = &device_tree::parse_dts; } else { fprintf(stderr, "Unknown input format: %s\n", optarg); return EXIT_FAILURE; } break; } case 'O': { - string arg = string(optarg); - if (arg == string("dtb")) + string arg(optarg); + if (arg == "dtb") { write_fn = &device_tree::write_binary; } - else if (arg == string("asm")) + else if (arg == "asm") { write_fn = &device_tree::write_asm; } - else if (arg == string("dts")) + else if (arg == "dts") { write_fn = &device_tree::write_dts; } else { fprintf(stderr, "Unknown output format: %s\n", optarg); return EXIT_FAILURE; } break; } case 'o': { outfile_name = optarg; outfile = open(optarg, O_CREAT | O_TRUNC | O_WRONLY, 0666); if (outfile == -1) { perror("Unable to open output file"); return EXIT_FAILURE; } break; } case 'D': debug_mode = true; break; case 'V': - if (string(optarg) != string("17")) + if (string(optarg) != "17") { fprintf(stderr, "Unknown output format version: %s\n", optarg); return EXIT_FAILURE; } break; case 'd': { if (depfile != 0) { fclose(depfile); } - if (string(optarg) == string("-")) + if (string(optarg) == "-") { depfile = stdout; } else { depfile = fdopen(open(optarg, O_CREAT | O_TRUNC | O_WRONLY, 0666), "w"); if (depfile == 0) { perror("Unable to open dependency file"); return EXIT_FAILURE; } } break; } case 'H': { - string arg = string(optarg); - if (arg == string("both")) + string arg(optarg); + if (arg == "both") { tree.set_phandle_format(device_tree::BOTH); } - else if (arg == string("epapr")) + else if (arg == "epapr") { tree.set_phandle_format(device_tree::EPAPR); } - else if (arg == string("linux")) + else if (arg == "linux") { tree.set_phandle_format(device_tree::LINUX); } else { fprintf(stderr, "Unknown phandle format: %s\n", optarg); return EXIT_FAILURE; } break; } case 'b': // Don't bother to check if strtoll fails, just // use the 0 it returns. boot_cpu = (uint32_t)strtoll(optarg, 0, 10); boot_cpu_specified = true; break; case 'f': keep_going = true; break; case 'W': case 'E': { - string arg = string(optarg); + string arg(optarg); if ((arg.size() > 3) && (strncmp(optarg, "no-", 3) == 0)) { arg = string(optarg+3); if (!checks.disable_checker(arg)) { fprintf(stderr, "Checker %s either does not exist or is already disabled\n", optarg+3); } break; } if (!checks.enable_checker(arg)) { fprintf(stderr, "Checker %s either does not exist or is already enabled\n", optarg); } break; } case 's': { sort = true; break; } case 'i': { tree.add_include_path(optarg); break; } // Should quiet warnings, but for now is silently ignored. case 'q': break; case 'R': tree.set_empty_reserve_map_entries(strtoll(optarg, 0, 10)); break; case 'S': tree.set_blob_minimum_size(strtoll(optarg, 0, 10)); break; case 'p': tree.set_blob_padding(strtoll(optarg, 0, 10)); break; case 'P': if (!tree.parse_define(optarg)) { fprintf(stderr, "Invalid predefine value %s\n", optarg); } break; default: fprintf(stderr, "Unknown option %c\n", ch); return EXIT_FAILURE; } } if (optind < argc) { in_file = argv[optind]; } if (depfile != 0) { fputs(outfile_name, depfile); fputs(": ", depfile); fputs(in_file, depfile); } clock_t c1 = clock(); (tree.*read_fn)(in_file, depfile); // Override the boot CPU found in the header, if we're loading from dtb if (boot_cpu_specified) { tree.set_boot_cpu(boot_cpu); } if (sort) { tree.sort(); } if (depfile != 0) { putc('\n', depfile); fclose(depfile); } if (!(tree.is_valid() || keep_going)) { - fprintf(stderr, "Failed to parse tree. Unhappy face!\n"); + fprintf(stderr, "Failed to parse tree.\n"); return EXIT_FAILURE; } clock_t c2 = clock(); if (!(checks.run_checks(&tree, true) || keep_going)) { return EXIT_FAILURE; } clock_t c3 = clock(); (tree.*write_fn)(outfile); close(outfile); clock_t c4 = clock(); if (debug_mode) { struct rusage r; getrusage(RUSAGE_SELF, &r); fprintf(stderr, "Peak memory usage: %ld bytes\n", r.ru_maxrss); fprintf(stderr, "Setup and option parsing took %f seconds\n", ((double)(c1-c0))/CLOCKS_PER_SEC); fprintf(stderr, "Parsing took %f seconds\n", ((double)(c2-c1))/CLOCKS_PER_SEC); fprintf(stderr, "Checking took %f seconds\n", ((double)(c3-c2))/CLOCKS_PER_SEC); fprintf(stderr, "Generating output took %f seconds\n", ((double)(c4-c3))/CLOCKS_PER_SEC); fprintf(stderr, "Total time: %f seconds\n", ((double)(c4-c0))/CLOCKS_PER_SEC); // This is not needed, but keeps valgrind quiet. fclose(stdin); } return EXIT_SUCCESS; } Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/fdt.cc =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/fdt.cc (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/fdt.cc (revision 306832) @@ -1,1616 +1,1667 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define __STDC_LIMIT_MACROS 1 #include "fdt.hh" #include "dtb.hh" #include +#include #include #include #include #include #include #include #include #include #include #include +using std::string; + namespace dtc { namespace fdt { uint32_t property_value::get_as_uint32() { if (byte_data.size() != 4) { return 0; } uint32_t v = 0; v &= byte_data[0] << 24; v &= byte_data[1] << 16; v &= byte_data[2] << 8; v &= byte_data[3] << 0; return v; } void property_value::push_to_buffer(byte_buffer &buffer) { if (!byte_data.empty()) { buffer.insert(buffer.end(), byte_data.begin(), byte_data.end()); } else { - string_data.push_to_buffer(buffer, true); + push_string(buffer, string_data, true); // Trailing nul buffer.push_back(0); } } void property_value::write_dts(FILE *file) { resolve_type(); switch (type) { default: assert(0 && "Invalid type"); case STRING: case STRING_LIST: case CROSS_REFERENCE: write_as_string(file); break; case PHANDLE: write_as_cells(file); break; case BINARY: if (byte_data.size() % 4 == 0) { write_as_cells(file); break; } write_as_bytes(file); break; } } void property_value::resolve_type() { if (type != UNKNOWN) { return; } if (byte_data.empty()) { type = STRING; return; } if (byte_data.back() == 0) { bool is_all_printable = true; int nuls = 0; int bytes = 0; bool lastWasNull = false; for (auto i : byte_data) { bytes++; is_all_printable &= (i == '\0') || isprint(i); if (i == '\0') { // If there are two nulls in a row, then we're probably binary. if (lastWasNull) { type = BINARY; return; } nuls++; lastWasNull = true; } else { lastWasNull = false; } if (!is_all_printable) { break; } } if ((is_all_printable && (bytes > nuls)) || bytes == 0) { type = STRING; if (nuls > 1) { type = STRING_LIST; } return; } } type = BINARY; } void property_value::write_as_string(FILE *file) { putc('"', file); if (byte_data.empty()) { - string_data.print(file); + fputs(string_data.c_str(), file); } else { bool hasNull = (byte_data.back() == '\0'); // Remove trailing null bytes from the string before printing as dts. if (hasNull) { byte_data.pop_back(); } for (auto i : byte_data) { // FIXME Escape tabs, newlines, and so on. if (i == '\0') { fputs("\", \"", file); continue; } putc(i, file); } if (hasNull) { byte_data.push_back('\0'); } } putc('"', file); } void property_value::write_as_cells(FILE *file) { putc('<', file); assert((byte_data.size() % 4) == 0); for (auto i=byte_data.begin(), e=byte_data.end(); i!=e ; ++i) { uint32_t v = 0; v = (v << 8) | *i; ++i; v = (v << 8) | *i; ++i; v = (v << 8) | *i; ++i; v = (v << 8) | *i; fprintf(file, "0x%" PRIx32, v); if (i+1 != e) { putc(' ', file); } } putc('>', file); } void property_value::write_as_bytes(FILE *file) { putc('[', file); for (auto i=byte_data.begin(), e=byte_data.end(); i!=e ; i++) { fprintf(file, "%02hhx", *i); if (i+1 != e) { putc(' ', file); } } putc(']', file); } void -property::parse_string(input_buffer &input) +property::parse_string(text_input_buffer &input) { property_value v; - assert(input[0] == '"'); + assert(*input == '"'); ++input; - const char *start = (const char*)input; - int length = 0; - while (char c = input[0]) + std::vector bytes; + bool isEscaped = false; + while (char c = *input) { - if (c == '"' && input[-1] != '\\') + if (c == '"' && !isEscaped) { input.consume('"'); break; } + isEscaped = (c == '\\'); + bytes.push_back(c); ++input; - ++length; } - v.string_data = string(start, length); + v.string_data = string(bytes.begin(), bytes.end()); values.push_back(v); } void -property::parse_cells(input_buffer &input, int cell_size) +property::parse_cells(text_input_buffer &input, int cell_size) { - assert(input[0] == '<'); + assert(*input == '<'); ++input; property_value v; input.next_token(); while (!input.consume('>')) { input.next_token(); // If this is a phandle then we need to get the name of the // referenced node if (input.consume('&')) { if (cell_size != 32) { input.parse_error("reference only permitted in 32-bit arrays"); valid = false; return; } input.next_token(); - // FIXME: We should support full paths here, but we - // don't. - string referenced = string::parse_node_name(input); + bool isPath = false; + string referenced; + if (!input.consume('{')) + { + referenced = input.parse_node_name(); + } + else + { + referenced = input.parse_to('}'); + input.consume('}'); + isPath = true; + } if (referenced.empty()) { input.parse_error("Expected node name"); valid = false; return; } input.next_token(); // If we already have some bytes, make the phandle a // separate component. if (!v.byte_data.empty()) { values.push_back(v); v = property_value(); } v.string_data = referenced; v.type = property_value::PHANDLE; values.push_back(v); v = property_value(); } else { //FIXME: We should support labels in the middle //of these, but we don't. unsigned long long val; if (!input.consume_integer_expression(val)) { input.parse_error("Expected numbers in array of cells"); valid = false; return; } switch (cell_size) { case 8: v.byte_data.push_back(val); break; case 16: push_big_endian(v.byte_data, (uint16_t)val); break; case 32: push_big_endian(v.byte_data, (uint32_t)val); break; case 64: push_big_endian(v.byte_data, (uint64_t)val); break; default: assert(0 && "Invalid cell size!"); } input.next_token(); } } // Don't store an empty string value here. if (v.byte_data.size() > 0) { values.push_back(v); } } void -property::parse_bytes(input_buffer &input) +property::parse_bytes(text_input_buffer &input) { - assert(input[0] == '['); + assert(*input == '['); ++input; property_value v; input.next_token(); while (!input.consume(']')) { { //FIXME: We should support //labels in the middle of //these, but we don't. uint8_t val; if (!input.consume_hex_byte(val)) { input.parse_error("Expected hex bytes in array of bytes"); valid = false; return; } v.byte_data.push_back(val); input.next_token(); } } values.push_back(v); } void -property::parse_reference(input_buffer &input) +property::parse_reference(text_input_buffer &input) { - assert(input[0] == '&'); + assert(*input == '&'); ++input; input.next_token(); property_value v; - v.string_data = string::parse_node_name(input); + v.string_data = input.parse_node_name(); if (v.string_data.empty()) { input.parse_error("Expected node name"); valid = false; return; } v.type = property_value::CROSS_REFERENCE; values.push_back(v); } property::property(input_buffer &structs, input_buffer &strings) { uint32_t name_offset; uint32_t length; valid = structs.consume_binary(length) && structs.consume_binary(name_offset); if (!valid) { fprintf(stderr, "Failed to read property\n"); return; } // Find the name input_buffer name_buffer = strings.buffer_from_offset(name_offset); - if (name_buffer.empty()) + if (name_buffer.finished()) { fprintf(stderr, "Property name offset %" PRIu32 " is past the end of the strings table\n", name_offset); valid = false; return; } - key = string(name_buffer); + key = name_buffer.parse_to(0); // If we're empty, do not push anything as value. if (!length) return; // Read the value uint8_t byte; property_value v; for (uint32_t i=0 ; ifind(name)) == defines->end())) { input.parse_error("Undefined property name\n"); valid = false; return; } values.push_back((*found).second->values[0]); } -property::property(input_buffer &input, - string k, - string l, +property::property(text_input_buffer &input, + string &&k, + string_set &&l, bool semicolonTerminated, - define_map *defines) : key(k), label(l), valid(true) + define_map *defines) : key(k), labels(l), valid(true) { do { input.next_token(); - switch (input[0]) + switch (*input) { case '$': { parse_define(input, defines); if (valid) { break; } } default: input.parse_error("Invalid property value."); valid = false; return; case '/': { unsigned long long bits = 0; valid = input.consume("/bits/"); input.next_token(); valid &= input.consume_integer(bits); if ((bits != 8) && (bits != 16) && (bits != 32) && (bits != 64)) { input.parse_error("Invalid size for elements"); valid = false; } if (!valid) return; input.next_token(); - if (input[0] != '<') + if (*input != '<') { input.parse_error("/bits/ directive is only valid on arrays"); valid = false; return; } parse_cells(input, bits); break; } case '"': parse_string(input); break; case '<': parse_cells(input, 32); break; case '[': parse_bytes(input); break; case '&': parse_reference(input); break; case ';': { break; } } input.next_token(); } while (input.consume(',')); if (semicolonTerminated && !input.consume(';')) { input.parse_error("Expected ; at end of property"); valid = false; } } property_ptr property::parse_dtb(input_buffer &structs, input_buffer &strings) { property_ptr p(new property(structs, strings)); if (!p->valid) { p = nullptr; } return p; } property_ptr -property::parse(input_buffer &input, string key, string label, +property::parse(text_input_buffer &input, string &&key, string_set &&label, bool semicolonTerminated, define_map *defines) { - property_ptr p(new property(input, key, label, semicolonTerminated, defines)); + property_ptr p(new property(input, + std::move(key), + std::move(label), + semicolonTerminated, + defines)); if (!p->valid) { p = nullptr; } return p; } void property::write(dtb::output_writer &writer, dtb::string_table &strings) { writer.write_token(dtb::FDT_PROP); byte_buffer value_buffer; for (value_iterator i=begin(), e=end() ; i!=e ; ++i) { i->push_to_buffer(value_buffer); } writer.write_data((uint32_t)value_buffer.size()); writer.write_comment(key); writer.write_data(strings.add_string(key)); writer.write_data(value_buffer); } bool property_value::try_to_merge(property_value &other) { resolve_type(); switch (type) { case UNKNOWN: __builtin_unreachable(); assert(0); return false; case EMPTY: *this = other; case STRING: case STRING_LIST: case CROSS_REFERENCE: return false; case PHANDLE: case BINARY: if (other.type == PHANDLE || other.type == BINARY) { type = BINARY; byte_data.insert(byte_data.end(), other.byte_data.begin(), other.byte_data.end()); return true; } } return false; } void property::write_dts(FILE *file, int indent) { for (int i=0 ; i *vals = &values; std::vector v; // If we've got multiple values then try to merge them all together. if (values.size() > 1) { vals = &v; v.push_back(values.front()); for (auto i=(++begin()), e=end() ; i!=e ; ++i) { if (!v.back().try_to_merge(*i)) { v.push_back(*i); } } } fputs(" = ", file); for (auto i=vals->begin(), e=vals->end() ; i!=e ; ++i) { i->write_dts(file); if (i+1 != e) { putc(',', file); putc(' ', file); } } } fputs(";\n", file); } string -node::parse_name(input_buffer &input, bool &is_property, const char *error) +node::parse_name(text_input_buffer &input, bool &is_property, const char *error) { if (!valid) { return string(); } input.next_token(); if (is_property) { - return string::parse_property_name(input); + return input.parse_property_name(); } - string n = string::parse_node_or_property_name(input, is_property); + string n = input.parse_node_or_property_name(is_property); if (n.empty()) { if (n.empty()) { input.parse_error(error); valid = false; } } return n; } void node::visit(std::function fn) { fn(*this); for (auto &&c : children) { c->visit(fn); } } node::node(input_buffer &structs, input_buffer &strings) : valid(true) { - const char *name_start = (const char*)structs; - int name_length = 0; + std::vector bytes; while (structs[0] != '\0' && structs[0] != '@') { - name_length++; + bytes.push_back(structs[0]); ++structs; } - name = string(name_start, name_length); + name = string(bytes.begin(), bytes.end()); + bytes.clear(); if (structs[0] == '@') { ++structs; - name_start = (const char*)structs; - name_length = 0; while (structs[0] != '\0') { - name_length++; + bytes.push_back(structs[0]); ++structs; } - unit_address = string(name_start, name_length); + unit_address = string(bytes.begin(), bytes.end()); } ++structs; uint32_t token; while (structs.consume_binary(token)) { switch (token) { default: fprintf(stderr, "Unexpected token 0x%" PRIx32 " while parsing node.\n", token); valid = false; return; // Child node, parse it. case dtb::FDT_BEGIN_NODE: { node_ptr child = node::parse_dtb(structs, strings); if (child == 0) { valid = false; return; } children.push_back(std::move(child)); break; } // End of this node, no errors. case dtb::FDT_END_NODE: return; // Property, parse it. case dtb::FDT_PROP: { property_ptr prop = property::parse_dtb(structs, strings); if (prop == 0) { valid = false; return; } props.push_back(prop); break; } break; // End of structs table. Should appear after // the end of the last node. case dtb::FDT_END: fprintf(stderr, "Unexpected FDT_END token while parsing node.\n"); valid = false; return; // NOPs are padding. Ignore them. case dtb::FDT_NOP: break; } } fprintf(stderr, "Failed to read token from structs table while parsing node.\n"); valid = false; return; } -node::node(input_buffer &input, string n, string l, string a, define_map *defines) : - label(l), name(n), unit_address(a), valid(true) +node::node(text_input_buffer &input, + string &&n, + std::unordered_set &&l, + string &&a, + define_map *defines) + : labels(l), name(n), unit_address(a), valid(true) { if (!input.consume('{')) { input.parse_error("Expected { to start new device tree node.\n"); } input.next_token(); while (valid && !input.consume('}')) { // flag set if we find any characters that are only in // the property name character set, not the node bool is_property = false; - string child_name, child_label, child_address; + string child_name, child_address; + std::unordered_set child_labels; + auto parse_delete = [&](const char *expected, bool at) + { + if (child_name == string()) + { + input.parse_error(expected); + valid = false; + return; + } + input.next_token(); + if (at && input.consume('@')) + { + child_name += '@'; + child_name += parse_name(input, is_property, "Expected unit address"); + } + if (!input.consume(';')) + { + input.parse_error("Expected semicolon"); + valid = false; + return; + } + input.next_token(); + }; + if (input.consume("/delete-node/")) + { + input.next_token(); + child_name = input.parse_node_name(); + parse_delete("Expected node name", true); + if (valid) + { + deleted_children.insert(child_name); + } + continue; + } + if (input.consume("/delete-property/")) + { + input.next_token(); + child_name = input.parse_property_name(); + parse_delete("Expected property name", false); + if (valid) + { + deleted_props.insert(child_name); + } + continue; + } child_name = parse_name(input, is_property, "Expected property or node name"); - if (input.consume(':')) + while (input.consume(':')) { // Node labels can contain any characters? The // spec doesn't say, so we guess so... is_property = false; - child_label = child_name; + child_labels.insert(std::move(child_name)); child_name = parse_name(input, is_property, "Expected property or node name"); } if (input.consume('@')) { child_address = parse_name(input, is_property, "Expected unit address"); } if (!valid) { return; } input.next_token(); // If we're parsing a property, then we must actually do that. if (input.consume('=')) { - property_ptr p = property::parse(input, child_name, - child_label, true, defines); + property_ptr p = property::parse(input, std::move(child_name), + std::move(child_labels), true, defines); if (p == 0) { valid = false; } else { props.push_back(p); } } - else if (!is_property && input[0] == ('{')) + else if (!is_property && *input == ('{')) { - node_ptr child = node::parse(input, child_name, - child_label, child_address, defines); + node_ptr child = node::parse(input, std::move(child_name), + std::move(child_labels), std::move(child_address), defines); if (child) { children.push_back(std::move(child)); } else { valid = false; } } else if (input.consume(';')) { - props.push_back(property_ptr(new property(child_name, child_label))); + props.push_back(property_ptr(new property(std::move(child_name), std::move(child_labels)))); } else { - input.parse_error("Error parsing property."); + input.parse_error("Error parsing property. Expected property value"); valid = false; } input.next_token(); } + input.next_token(); input.consume(';'); } bool node::cmp_properties(property_ptr &p1, property_ptr &p2) { return p1->get_key() < p2->get_key(); } bool node::cmp_children(node_ptr &c1, node_ptr &c2) { if (c1->name == c2->name) { return c1->unit_address < c2->unit_address; } return c1->name < c2->name; } void node::sort() { std::sort(property_begin(), property_end(), cmp_properties); std::sort(child_begin(), child_end(), cmp_children); for (auto &c : child_nodes()) { c->sort(); } } node_ptr -node::parse(input_buffer &input, - string name, - string label, - string address, +node::parse(text_input_buffer &input, + string &&name, + string_set &&label, + string &&address, define_map *defines) { - node_ptr n(new node(input, name, label, address, defines)); + node_ptr n(new node(input, + std::move(name), + std::move(label), + std::move(address), + defines)); if (!n->valid) { n = 0; } return n; } node_ptr node::parse_dtb(input_buffer &structs, input_buffer &strings) { node_ptr n(new node(structs, strings)); if (!n->valid) { n = 0; } return n; } property_ptr -node::get_property(string key) +node::get_property(const string &key) { for (auto &i : props) { if (i->get_key() == key) { return i; } } return 0; } void node::merge_node(node_ptr other) { - if (!other->label.empty()) + for (auto &l : other->labels) { - label = other->label; + labels.insert(l); } // Note: this is an O(n*m) operation. It might be sensible to // optimise this if we find that there are nodes with very // large numbers of properties, but for typical usage the // entire vector will fit (easily) into cache, so iterating // over it repeatedly isn't that expensive. for (auto &p : other->properties()) { bool found = false; for (auto &mp : properties()) { if (mp->get_key() == p->get_key()) { mp = p; found = true; break; } } if (!found) { add_property(p); } } for (auto &c : other->children) { bool found = false; for (auto &i : children) { if (i->name == c->name && i->unit_address == c->unit_address) { i->merge_node(std::move(c)); found = true; break; } } if (!found) { children.push_back(std::move(c)); } } + children.erase(std::remove_if(children.begin(), children.end(), + [&](const node_ptr &p) { + string full_name = p->name; + if (p->unit_address != string()) + { + full_name += '@'; + full_name += p->unit_address; + } + if (other->deleted_children.count(full_name) > 0) + { + other->deleted_children.erase(full_name); + return true; + } + return false; + }), children.end()); + props.erase(std::remove_if(props.begin(), props.end(), + [&](const property_ptr &p) { + if (other->deleted_props.count(p->get_key()) > 0) + { + other->deleted_props.erase(p->get_key()); + return true; + } + return false; + }), props.end()); } void node::write(dtb::output_writer &writer, dtb::string_table &strings) { writer.write_token(dtb::FDT_BEGIN_NODE); byte_buffer name_buffer; - name.push_to_buffer(name_buffer); + push_string(name_buffer, name); if (unit_address != string()) { name_buffer.push_back('@'); - unit_address.push_to_buffer(name_buffer); + push_string(name_buffer, unit_address); } writer.write_comment(name); writer.write_data(name_buffer); writer.write_data((uint8_t)0); for (auto p : properties()) { p->write(writer, strings); } for (auto &c : child_nodes()) { c->write(writer, strings); } writer.write_token(dtb::FDT_END_NODE); } void node::write_dts(FILE *file, int indent) { for (int i=0 ; iwrite_dts(file, indent+1); } for (auto &c : child_nodes()) { c->write_dts(file, indent+1); } for (int i=0 ; ilabel; path.push_back(std::make_pair(n->name, n->unit_address)); - if (name != string()) + for (const string &name : n->labels) { - if (node_names.find(name) == node_names.end()) + if (name != string()) { - node_names.insert(std::make_pair(name, n.get())); - node_paths.insert(std::make_pair(name, path)); - } - else - { - node_names[name] = (node*)-1; - auto i = node_paths.find(name); - if (i != node_paths.end()) + auto iter = node_names.find(name); + if (iter == node_names.end()) { - node_paths.erase(name); + node_names.insert(std::make_pair(name, n.get())); + node_paths.insert(std::make_pair(name, path)); } - fprintf(stderr, "Label not unique: "); - name.dump(); - fprintf(stderr, ". References to this label will not be resolved."); + else + { + node_names.erase(iter); + auto i = node_paths.find(name); + if (i != node_paths.end()) + { + node_paths.erase(name); + } + fprintf(stderr, "Label not unique: %s. References to this label will not be resolved.\n", name.c_str()); + } } } for (auto &c : n->child_nodes()) { collect_names_recursive(c, path); } path.pop_back(); // Now we collect the phandles and properties that reference // other nodes. for (auto &p : n->properties()) { for (auto &v : *p) { if (v.is_phandle()) { phandles.push_back(&v); } if (v.is_cross_reference()) { cross_references.push_back(&v); } } - if (p->get_key() == string("phandle") || - p->get_key() == string("linux,phandle")) + if ((p->get_key() == "phandle") || + (p->get_key() == "linux,phandle")) { if (p->begin()->byte_data.size() != 4) { - fprintf(stderr, "Invalid phandle value for node "); - n->name.dump(); - fprintf(stderr, ". Should be a 4-byte value.\n"); + fprintf(stderr, "Invalid phandle value for node %s. Should be a 4-byte value.\n", n->name.c_str()); valid = false; } else { uint32_t phandle = p->begin()->get_as_uint32(); used_phandles.insert(std::make_pair(phandle, n.get())); } } } } void device_tree::collect_names() { node_path p; node_names.clear(); node_paths.clear(); cross_references.clear(); phandles.clear(); collect_names_recursive(root, p); } void device_tree::resolve_cross_references() { for (auto *pv : cross_references) { node_path path = node_paths[pv->string_data]; - // Skip the first name in the path. It's always "", and implicitly / - for (auto p=path.begin()+1, pe=path.end() ; p!=pe ; ++p) + auto p = path.begin(); + auto pe = path.end(); + if (p != pe) { - pv->byte_data.push_back('/'); - p->first.push_to_buffer(pv->byte_data); - if (!(p->second.empty())) + // Skip the first name in the path. It's always "", and implicitly / + for (++p ; p!=pe ; ++p) { - pv->byte_data.push_back('@'); - p->second.push_to_buffer(pv->byte_data); - pv->byte_data.push_back(0); + pv->byte_data.push_back('/'); + push_string(pv->byte_data, p->first); + if (!(p->second.empty())) + { + pv->byte_data.push_back('@'); + push_string(pv->byte_data, p->second); + pv->byte_data.push_back(0); + } } } } std::unordered_set phandle_set; for (auto &i : phandles) { phandle_set.insert(i); } std::vector sorted_phandles; root->visit([&](node &n) { for (auto &p : n.properties()) { for (auto &v : *p) { if (phandle_set.count(&v)) { sorted_phandles.push_back(&v); } } } }); assert(sorted_phandles.size() == phandles.size()); uint32_t phandle = 1; for (auto &i : sorted_phandles) { string target_name = i->string_data; - node *target = node_names[target_name]; - if (target == 0) + node *target = nullptr; + string possible; + // If the node name is a path, then look it up by following the path, + // otherwise jump directly to the named node. + if (target_name[0] == '/') { - fprintf(stderr, "Failed to find node with label: "); - target_name.dump(); - fprintf(stderr, "\n"); + std::string path; + target = root.get(); + std::istringstream ss(target_name); + string path_element; + // Read the leading / + std::getline(ss, path_element, '/'); + // Iterate over path elements + while (!ss.eof()) + { + path += '/'; + std::getline(ss, path_element, '/'); + std::istringstream nss(path_element); + string node_name, node_address; + std::getline(nss, node_name, '@'); + std::getline(nss, node_address, '@'); + node *next = nullptr; + for (auto &c : target->child_nodes()) + { + if (c->name == node_name) + { + if (c->unit_address == node_address) + { + next = c.get(); + break; + } + else + { + possible = path + c->name; + if (c->unit_address != string()) + { + possible += '@'; + possible += c->unit_address; + } + } + } + } + path += node_name; + if (node_address != string()) + { + path += '@'; + path += node_address; + } + target = next; + if (target == nullptr) + { + break; + } + } + } + else + { + target = node_names[target_name]; + } + if (target == nullptr) + { + fprintf(stderr, "Failed to find node with label: %s\n", target_name.c_str()); + if (possible != string()) + { + fprintf(stderr, "Possible intended match: %s\n", possible.c_str()); + } valid = 0; return; } // If there is an existing phandle, use it property_ptr p = target->get_property("phandle"); if (p == 0) { p = target->get_property("linux,phandle"); } if (p == 0) { // Otherwise insert a new phandle node property_value v; while (used_phandles.find(phandle) != used_phandles.end()) { // Note that we only don't need to // store this phandle in the set, // because we are monotonically // increasing the value of phandle and // so will only ever revisit this value // if we have used 2^32 phandles, at // which point our blob won't fit in // any 32-bit system and we've done // something badly wrong elsewhere // already. phandle++; } push_big_endian(v.byte_data, phandle++); if (phandle_node_name == BOTH || phandle_node_name == LINUX) { - p.reset(new property(string("linux,phandle"))); + p.reset(new property("linux,phandle")); p->add_value(v); target->add_property(p); } if (phandle_node_name == BOTH || phandle_node_name == EPAPR) { - p.reset(new property(string("phandle"))); + p.reset(new property("phandle")); p->add_value(v); target->add_property(p); } } p->begin()->push_to_buffer(i->byte_data); assert(i->byte_data.size() == 4); } } -bool -device_tree::parse_include(input_buffer &input, - const std::string &dir, - std::vector &roots, - FILE *depfile, - bool &read_header) -{ - if (!input.consume("/include/")) - { - return false; - } - bool reallyInclude = true; - if (input.consume("if ")) - { - input.next_token(); - string name = string::parse_property_name(input); - // XXX: Error handling - if (defines.find(name) == defines.end()) - { - reallyInclude = false; - } - input.consume('/'); - } - input.next_token(); - if (!input.consume('"')) - { - input.parse_error("Expected quoted filename"); - valid = false; - return false; - } - int length = 0; - while (input[length] != '"') length++; - std::string file((const char*)input, length); - std::string include_file = dir + '/' + file; - input.consume(file.c_str()); - if (!reallyInclude) - { - input.consume('"'); - input.next_token(); - return true; - } - - input_buffer *include_buffer = buffer_for_file(include_file.c_str(), false); - - if (include_buffer == 0) - { - for (auto i : include_paths) - { - include_file = i + '/' + file; - include_buffer = buffer_for_file(include_file.c_str()); - if (include_buffer != 0) - { - break; - } - } - } - if (depfile != 0) - { - putc(' ', depfile); - fputs(include_file.c_str(), depfile); - } - if (include_buffer == 0) - { - input.parse_error("Unable to locate input file"); - input.consume('"'); - input.next_token(); - valid = false; - return true; - } - input.consume('"'); - input.next_token(); - parse_file(*include_buffer, dir, roots, depfile, read_header); - return true; -} - void -device_tree::parse_file(input_buffer &input, - const std::string &dir, +device_tree::parse_file(text_input_buffer &input, std::vector &roots, - FILE *depfile, bool &read_header) { input.next_token(); // Read the header if (input.consume("/dts-v1/;")) { read_header = true; } input.next_token(); input.next_token(); if (!read_header) { input.parse_error("Expected /dts-v1/; version string"); } - while(parse_include(input, dir, roots, depfile, read_header)) {} // Read any memory reservations - while(input.consume("/memreserve/")) + while (input.consume("/memreserve/")) { unsigned long long start, len; input.next_token(); // Read the start and length. if (!(input.consume_integer_expression(start) && (input.next_token(), input.consume_integer_expression(len)))) { input.parse_error("Expected size on /memreserve/ node."); } input.next_token(); input.consume(';'); reservations.push_back(reservation(start, len)); + input.next_token(); } - input.next_token(); - while(parse_include(input, dir, roots, depfile, read_header)) {} while (valid && !input.finished()) { node_ptr n; if (input.consume('/')) { input.next_token(); - n = node::parse(input, string(), string(), string(), &defines); + n = node::parse(input, string(), string_set(), string(), &defines); } else if (input.consume('&')) { input.next_token(); - string name = string::parse_node_name(input); + string name = input.parse_node_name(); input.next_token(); - n = node::parse(input, name, string(), string(), &defines); + n = node::parse(input, std::move(name), string_set(), string(), &defines); } else { input.parse_error("Failed to find root node /."); } if (n) { roots.push_back(std::move(n)); } else { valid = false; } input.next_token(); - while(parse_include(input, dir, roots, depfile, read_header)) {} } } -input_buffer* -device_tree::buffer_for_file(const char *path, bool warn) -{ - if (string(path) == string("-")) - { - input_buffer *b = new stream_input_buffer(); - if (b) - { - std::unique_ptr ptr(b); - buffers.push_back(std::move(ptr)); - } - return b; - } - int source = open(path, O_RDONLY); - if (source == -1) - { - if (warn) - { - fprintf(stderr, "Unable to open file '%s'. %s\n", path, strerror(errno)); - } - return 0; - } - struct stat st; - if (fstat(source, &st) == 0 && S_ISDIR(st.st_mode)) - { - fprintf(stderr, "File %s is a directory\n", path); - close(source); - return 0; - } - input_buffer *b = new mmap_input_buffer(source); - // Keep the buffer that owns the memory around for the lifetime - // of this FDT. Ones simply referring to it may have shorter - // lifetimes. - if (b) - { - std::unique_ptr ptr(b); - buffers.push_back(std::move(ptr)); - } - close(source); - return b; -} - template void device_tree::write(int fd) { dtb::string_table st; dtb::header head; writer head_writer; writer reservation_writer; writer struct_writer; writer strings_writer; // Build the reservation table reservation_writer.write_comment(string("Memory reservations")); reservation_writer.write_label(string("dt_reserve_map")); for (auto &i : reservations) { reservation_writer.write_comment(string("Reservation start")); reservation_writer.write_data(i.first); reservation_writer.write_comment(string("Reservation length")); reservation_writer.write_data(i.first); } // Write n spare reserve map entries, plus the trailing 0. for (uint32_t i=0 ; i<=spare_reserve_map_entries ; i++) { reservation_writer.write_data((uint64_t)0); reservation_writer.write_data((uint64_t)0); } struct_writer.write_comment(string("Device tree")); struct_writer.write_label(string("dt_struct_start")); root->write(struct_writer, st); struct_writer.write_token(dtb::FDT_END); struct_writer.write_label(string("dt_struct_end")); st.write(strings_writer); // Find the strings size before we stick padding on the end. // Note: We should possibly use a new writer for the padding. head.size_dt_strings = strings_writer.size(); // Stick the padding in the strings writer, but after the // marker indicating that it's the end. // Note: We probably should add a padding call to the writer so // that the asm back end can write padding directives instead // of a load of 0 bytes. for (uint32_t i=0 ; i(fd); } void device_tree::write_asm(int fd) { write(fd); } void device_tree::write_dts(int fd) { FILE *file = fdopen(fd, "w"); fputs("/dts-v1/;\n\n", file); if (!reservations.empty()) { const char msg[] = "/memreserve/"; fwrite(msg, sizeof(msg), 1, file); for (auto &i : reservations) { fprintf(file, " %" PRIx64 " %" PRIx64, i.first, i.second); } fputs(";\n\n", file); } putc('/', file); putc(' ', file); root->write_dts(file, 0); fclose(file); } void -device_tree::parse_dtb(const char *fn, FILE *) +device_tree::parse_dtb(const string &fn, FILE *) { - input_buffer *in = buffer_for_file(fn); + auto in = input_buffer::buffer_for_file(fn); if (in == 0) { valid = false; return; } input_buffer &input = *in; dtb::header h; valid = h.read_dtb(input); boot_cpu = h.boot_cpuid_phys; if (h.last_comp_version > 17) { fprintf(stderr, "Don't know how to read this version of the device tree blob"); valid = false; } if (!valid) { return; } input_buffer reservation_map = input.buffer_from_offset(h.off_mem_rsvmap, 0); uint64_t start, length; do { if (!(reservation_map.consume_binary(start) && reservation_map.consume_binary(length))) { fprintf(stderr, "Failed to read memory reservation table\n"); valid = false; return; } } while (!((start == 0) && (length == 0))); input_buffer struct_table = input.buffer_from_offset(h.off_dt_struct, h.size_dt_struct); input_buffer strings_table = input.buffer_from_offset(h.off_dt_strings, h.size_dt_strings); uint32_t token; if (!(struct_table.consume_binary(token) && (token == dtb::FDT_BEGIN_NODE))) { fprintf(stderr, "Expected FDT_BEGIN_NODE token.\n"); valid = false; return; } root = node::parse_dtb(struct_table, strings_table); if (!(struct_table.consume_binary(token) && (token == dtb::FDT_END))) { fprintf(stderr, "Expected FDT_END token after parsing root node.\n"); valid = false; return; } valid = (root != 0); } void -device_tree::parse_dts(const char *fn, FILE *depfile) +device_tree::parse_dts(const string &fn, FILE *depfile) { - input_buffer *in = buffer_for_file(fn); - std::string dir(dirname((char*)fn)); - if (in == 0) + auto in = input_buffer::buffer_for_file(fn); + if (!in) { valid = false; return; } std::vector roots; - input_buffer &input = *in; + std::unordered_set defnames; + for (auto &i : defines) + { + defnames.insert(i.first); + } + text_input_buffer input(std::move(in), + std::move(defnames), + std::vector(include_paths), + dirname(fn), + depfile); bool read_header = false; - parse_file(input, dir, roots, depfile, read_header); + parse_file(input, roots, read_header); switch (roots.size()) { case 0: valid = false; input.parse_error("Failed to find root node /."); return; case 1: root = std::move(roots[0]); break; default: { root = std::move(roots[0]); for (auto i=++(roots.begin()), e=roots.end() ; i!=e ; ++i) { auto &node = *i; string name = node->name; if (name == string()) { root->merge_node(std::move(node)); } else { auto existing = node_names.find(name); if (existing == node_names.end()) { collect_names(); existing = node_names.find(name); } if (existing == node_names.end()) { - fprintf(stderr, "Unable to merge node: "); - name.dump(); - fprintf(stderr, "\n"); + fprintf(stderr, "Unable to merge node: %s\n", name.c_str()); } - existing->second->merge_node(std::move(node)); + else + { + existing->second->merge_node(std::move(node)); + } } } } } collect_names(); resolve_cross_references(); } bool device_tree::parse_define(const char *def) { char *val = strchr(def, '='); if (!val) { if (strlen(def) != 0) { string name(def); defines[name]; return true; } return false; } string name(def, val-def); + string name_copy = name; val++; - input_buffer in = input_buffer(val, strlen(val)); - property_ptr p = property::parse(in, name, string(), false); + std::unique_ptr raw(new input_buffer(val, strlen(val))); + text_input_buffer in(std::move(raw), + std::unordered_set(), + std::vector(), + std::string(), + nullptr); + property_ptr p = property::parse(in, std::move(name_copy), string_set(), false); if (p) defines[name] = p; return (bool)p; } } // namespace fdt } // namespace dtc Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/fdt.hh =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/fdt.hh (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/fdt.hh (revision 306832) @@ -1,866 +1,888 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FDT_HH_ #define _FDT_HH_ #include #include #include #include #include #include "util.hh" -#include "string.hh" +#include "input_buffer.hh" namespace dtc { namespace dtb { struct output_writer; class string_table; } namespace fdt { class property; class node; /** * Type for (owned) pointers to properties. */ typedef std::shared_ptr property_ptr; /** * Owning pointer to a node. */ typedef std::unique_ptr node_ptr; /** * Map from macros to property pointers. */ -typedef std::unordered_map define_map; +typedef std::unordered_map define_map; /** + * Set of strings used for label names. + */ +typedef std::unordered_set string_set; +/** * Properties may contain a number of different value, each with a different * label. This class encapsulates a single value. */ struct property_value { /** * The label for this data. This is usually empty. */ - string label; + std::string label; /** * If this value is a string, or something resolved from a string (a * reference) then this contains the source string. */ - string string_data; + std::string string_data; /** * The data that should be written to the final output. */ byte_buffer byte_data; /** * Enumeration describing the possible types of a value. Note that * property-coded arrays will appear simply as binary (or possibly * string, if they happen to be nul-terminated and printable), and must * be checked separately. */ enum value_type { /** * This is a list of strings. When read from source, string * lists become one property value for each string, however * when read from binary we have a single property value * incorporating the entire text, with nul bytes separating the * strings. */ STRING_LIST, /** * This property contains a single string. */ STRING, /** * This is a binary value. Check the size of byte_data to * determine how many bytes this contains. */ BINARY, /** This contains a short-form address that should be replaced * by a fully-qualified version. This will only appear when * the input is a device tree source. When parsed from a * device tree blob, the cross reference will have already been * resolved and the property value will be a string containing * the full path of the target node. */ CROSS_REFERENCE, /** * This is a phandle reference. When parsed from source, the * string_data will contain the node label for the target and, * after cross references have been resolved, the binary data * will contain a 32-bit integer that should match the phandle * property of the target node. */ PHANDLE, /** * An empty property value. This will never appear on a real * property value, it is used by checkers to indicate that no * property values should exist for a property. */ EMPTY, /** * The type of this property has not yet been determined. */ UNKNOWN }; /** * The type of this property. */ value_type type; /** * Returns true if this value is a cross reference, false otherwise. */ inline bool is_cross_reference() { return is_type(CROSS_REFERENCE); } /** * Returns true if this value is a phandle reference, false otherwise. */ inline bool is_phandle() { return is_type(PHANDLE); } /** * Returns true if this value is a string, false otherwise. */ inline bool is_string() { return is_type(STRING); } /** * Returns true if this value is a string list (a nul-separated * sequence of strings), false otherwise. */ inline bool is_string_list() { return is_type(STRING_LIST); } /** * Returns true if this value is binary, false otherwise. */ inline bool is_binary() { return is_type(BINARY); } /** * Returns this property value as a 32-bit integer. Returns 0 if this * property value is not 32 bits long. The bytes in the property value * are assumed to be in big-endian format, but the return value is in * the host native endian. */ uint32_t get_as_uint32(); /** * Default constructor, specifying the label of the value. */ - property_value(string l=string()) : label(l), type(UNKNOWN) {} + property_value(std::string l=std::string()) : label(l), type(UNKNOWN) {} /** * Writes the data for this value into an output buffer. */ void push_to_buffer(byte_buffer &buffer); /** * Writes the property value to the standard output. This uses the * following heuristics for deciding how to print the output: * * - If the value is nul-terminated and only contains printable * characters, it is written as a string. * - If it is a multiple of 4 bytes long, then it is printed as cells. * - Otherwise, it is printed as a byte buffer. */ void write_dts(FILE *file); /** * Tries to merge adjacent property values, returns true if it succeeds and * false otherwise. */ bool try_to_merge(property_value &other); private: /** * Returns whether the value is of the specified type. If the type of * the value has not yet been determined, then this calculates it. */ inline bool is_type(value_type v) { if (type == UNKNOWN) { resolve_type(); } return type == v; } /** * Determines the type of the value based on its contents. */ void resolve_type(); /** * Writes the property value to the specified file as a quoted string. * This is used when generating DTS. */ void write_as_string(FILE *file); /** * Writes the property value to the specified file as a sequence of * 32-bit big-endian cells. This is used when generating DTS. */ void write_as_cells(FILE *file); /** * Writes the property value to the specified file as a sequence of * bytes. This is used when generating DTS. */ void write_as_bytes(FILE *file); }; /** * A value encapsulating a single property. This contains a key, optionally a * label, and optionally one or more values. */ class property { /** * The name of this property. */ - string key; + std::string key; /** - * An optional label. + * Zero or more labels. */ - string label; + string_set labels; /** * The values in this property. */ std::vector values; /** * Value indicating that this is a valid property. If a parse error * occurs, then this value is false. */ bool valid; /** * Parses a string property value, i.e. a value enclosed in double quotes. */ - void parse_string(input_buffer &input); + void parse_string(text_input_buffer &input); /** * Parses one or more 32-bit values enclosed in angle brackets. */ - void parse_cells(input_buffer &input, int cell_size); + void parse_cells(text_input_buffer &input, int cell_size); /** * Parses an array of bytes, contained within square brackets. */ - void parse_bytes(input_buffer &input); + void parse_bytes(text_input_buffer &input); /** * Parses a reference. This is a node label preceded by an ampersand * symbol, which should expand to the full path to that node. * * Note: The specification says that the target of such a reference is * a node name, however dtc assumes that it is a label, and so we * follow their interpretation for compatibility. */ - void parse_reference(input_buffer &input); + void parse_reference(text_input_buffer &input); /** * Parse a predefined macro definition for a property. */ - void parse_define(input_buffer &input, define_map *defines); + void parse_define(text_input_buffer &input, define_map *defines); /** * Constructs a new property from two input buffers, pointing to the * struct and strings tables in the device tree blob, respectively. * The structs input buffer is assumed to have just consumed the * FDT_PROP token. */ property(input_buffer &structs, input_buffer &strings); /** * Parses a new property from the input buffer. */ - property(input_buffer &input, - string k, - string l, + property(text_input_buffer &input, + std::string &&k, + string_set &&l, bool terminated, define_map *defines); public: /** * Creates an empty property. */ - property(string k, string l=string()) : key(k), label(l), valid(true) - {} + property(std::string &&k, string_set &&l=string_set()) + : key(k), labels(l), valid(true) {} /** * Copy constructor. */ - property(property &p) : key(p.key), label(p.label), values(p.values), + property(property &p) : key(p.key), labels(p.labels), values(p.values), valid(p.valid) {} /** * Factory method for constructing a new property. Attempts to parse a * property from the input, and returns it on success. On any parse * error, this will return 0. */ static property_ptr parse_dtb(input_buffer &structs, - input_buffer &strings); + input_buffer &strings); /** * Factory method for constructing a new property. Attempts to parse a * property from the input, and returns it on success. On any parse * error, this will return 0. */ - static property_ptr parse(input_buffer &input, - string key, - string label=string(), + static property_ptr parse(text_input_buffer &input, + std::string &&key, + string_set &&labels=string_set(), bool semicolonTerminated=true, define_map *defines=0); /** * Iterator type used for accessing the values of a property. */ typedef std::vector::iterator value_iterator; /** * Returns an iterator referring to the first value in this property. */ inline value_iterator begin() { return values.begin(); } /** * Returns an iterator referring to the last value in this property. */ inline value_iterator end() { return values.end(); } /** * Adds a new value to an existing property. */ inline void add_value(property_value v) { values.push_back(v); } /** * Returns the key for this property. */ - inline string get_key() + inline std::string get_key() { return key; } /** * Writes the property to the specified writer. The property name is a * reference into the strings table. */ void write(dtb::output_writer &writer, dtb::string_table &strings); /** * Writes in DTS format to the specified file, at the given indent * level. This will begin the line with the number of tabs specified * as the indent level and then write the property in the most * applicable way that it can determine. */ void write_dts(FILE *file, int indent); }; /** * Class encapsulating a device tree node. Nodes may contain properties and * other nodes. */ class node { public: /** - * The label for this node, if any. Node labels are used as the + * The labels for this node, if any. Node labels are used as the * targets for cross references. */ - string label; + std::unordered_set labels; /** * The name of the node. */ - string name; + std::string name; /** * The unit address of the node, which is optionally written after the * name followed by an at symbol. */ - string unit_address; + std::string unit_address; /** * The type for the property vector. */ typedef std::vector property_vector; /** * Iterator type for child nodes. */ typedef std::vector::iterator child_iterator; private: /** * Adaptor to use children in range-based for loops. */ struct child_range { child_range(node &nd) : n(nd) {} child_iterator begin() { return n.child_begin(); } child_iterator end() { return n.child_end(); } private: node &n; }; /** * Adaptor to use properties in range-based for loops. */ struct property_range { property_range(node &nd) : n(nd) {} property_vector::iterator begin() { return n.property_begin(); } property_vector::iterator end() { return n.property_end(); } private: node &n; }; /** * The properties contained within this node. */ property_vector props; /** * The children of this node. */ std::vector children; /** + * Children that should be deleted from this node when merging. + */ + std::unordered_set deleted_children; + /** + * Properties that should be deleted from this node when merging. + */ + std::unordered_set deleted_props; + /** * A flag indicating whether this node is valid. This is set to false * if an error occurs during parsing. */ bool valid; /** * Parses a name inside a node, writing the string passed as the last * argument as an error if it fails. */ - string parse_name(input_buffer &input, - bool &is_property, - const char *error); + std::string parse_name(text_input_buffer &input, + bool &is_property, + const char *error); /** * Constructs a new node from two input buffers, pointing to the struct * and strings tables in the device tree blob, respectively. */ node(input_buffer &structs, input_buffer &strings); /** * Parses a new node from the specified input buffer. This is called * when the input cursor is on the open brace for the start of the * node. The name, and optionally label and unit address, should have * already been parsed. */ - node(input_buffer &input, string n, string l, string a, define_map*); + node(text_input_buffer &input, + std::string &&n, + std::unordered_set &&l, + std::string &&a, + define_map*); /** * Comparison function for properties, used when sorting the properties * vector. Orders the properties based on their names. */ static inline bool cmp_properties(property_ptr &p1, property_ptr &p2); /* { return p1->get_key() < p2->get_key(); } */ /** * Comparison function for nodes, used when sorting the children * vector. Orders the nodes based on their names or, if the names are * the same, by the unit addresses. */ static inline bool cmp_children(node_ptr &c1, node_ptr &c2); public: /** * Sorts the node's properties and children into alphabetical order and * recursively sorts the children. */ void sort(); /** * Returns an iterator for the first child of this node. */ inline child_iterator child_begin() { return children.begin(); } /** * Returns an iterator after the last child of this node. */ inline child_iterator child_end() { return children.end(); } + /** + * Returns a range suitable for use in a range-based for loop describing + * the children of this node. + */ inline child_range child_nodes() { return child_range(*this); } + /** + * Accessor for the deleted children. + */ + inline const std::unordered_set &deleted_child_nodes() + { + return deleted_children; + } + /** + * Accessor for the deleted properties + */ + inline const std::unordered_set &deleted_properties() + { + return deleted_props; + } + /** + * Returns a range suitable for use in a range-based for loop describing + * the properties of this node. + */ inline property_range properties() { return property_range(*this); } /** * Returns an iterator after the last property of this node. */ inline property_vector::iterator property_begin() { return props.begin(); } /** * Returns an iterator for the first property of this node. */ inline property_vector::iterator property_end() { return props.end(); } /** * Factory method for constructing a new node. Attempts to parse a * node in DTS format from the input, and returns it on success. On * any parse error, this will return 0. This should be called with the * cursor on the open brace of the property, after the name and so on * have been parsed. */ - static node_ptr parse(input_buffer &input, - string name, - string label=string(), - string address=string(), + static node_ptr parse(text_input_buffer &input, + std::string &&name, + std::unordered_set &&label=std::unordered_set(), + std::string &&address=std::string(), define_map *defines=0); /** * Factory method for constructing a new node. Attempts to parse a * node in DTB format from the input, and returns it on success. On * any parse error, this will return 0. This should be called with the * cursor on the open brace of the property, after the name and so on * have been parsed. */ static node_ptr parse_dtb(input_buffer &structs, input_buffer &strings); /** * Returns a property corresponding to the specified key, or 0 if this * node does not contain a property of that name. */ - property_ptr get_property(string key); + property_ptr get_property(const std::string &key); /** * Adds a new property to this node. */ inline void add_property(property_ptr &p) { props.push_back(p); } /** * Merges a node into this one. Any properties present in both are * overridden, any properties present in only one are preserved. */ void merge_node(node_ptr other); /** * Write this node to the specified output. Although nodes do not * refer to a string table directly, their properties do. The string * table passed as the second argument is used for the names of * properties within this node and its children. */ void write(dtb::output_writer &writer, dtb::string_table &strings); /** * Writes the current node as DTS to the specified file. The second * parameter is the indent level. This function will start every line * with this number of tabs. */ void write_dts(FILE *file, int indent); /** * Recursively visit this node and then its children. */ void visit(std::function); }; /** * Class encapsulating the entire parsed FDT. This is the top-level class, * which parses the entire DTS representation and write out the finished * version. */ class device_tree { public: /** * Type used for node paths. A node path is sequence of names and unit * addresses. */ - typedef std::vector > node_path; + typedef std::vector > node_path; /** * Name that we should use for phandle nodes. */ enum phandle_format { /** linux,phandle */ LINUX, /** phandle */ EPAPR, /** Create both nodes. */ BOTH }; private: /** * The format that we should use for writing phandles. */ phandle_format phandle_node_name; /** * Flag indicating that this tree is valid. This will be set to false * on parse errors. */ bool valid; /** * Type used for memory reservations. A reservation is two 64-bit * values indicating a base address and length in memory that the * kernel should not use. The high 32 bits are ignored on 32-bit * platforms. */ typedef std::pair reservation; /** * The memory reserves table. */ std::vector reservations; /** * Root node. All other nodes are children of this node. */ node_ptr root; /** * Mapping from names to nodes. Only unambiguous names are recorded, * duplicate names are stored as (node*)-1. */ - std::unordered_map node_names; + std::unordered_map node_names; /** * A map from labels to node paths. When resolving cross references, * we look up referenced nodes in this and replace the cross reference * with the full path to its target. */ - std::unordered_map node_paths; + std::unordered_map node_paths; /** * A collection of property values that are references to other nodes. * These should be expanded to the full path of their targets. */ std::vector cross_references; /** * A collection of property values that refer to phandles. These will * be replaced by the value of the phandle property in their * destination. */ std::vector phandles; /** * The names of nodes that target phandles. */ - std::unordered_set phandle_targets; + std::unordered_set phandle_targets; /** * A collection of input buffers that we are using. These input * buffers are the ones that own their memory, and so we must preserve * them for the lifetime of the device tree. */ std::vector> buffers; /** * A map of used phandle values to nodes. All phandles must be unique, * so we keep a set of ones that the user explicitly provides in the * input to ensure that we don't reuse them. * * This is a map, rather than a set, because we also want to be able to * find phandles that were provided by the user explicitly when we are * doing checking. */ std::unordered_map used_phandles; /** * Paths to search for include files. This contains a set of * nul-terminated strings, which are not owned by this class and so * must be freed separately. */ std::vector include_paths; /** * Dictionary of predefined macros provided on the command line. */ define_map defines; /** * The default boot CPU, specified in the device tree header. */ uint32_t boot_cpu; /** * The number of empty reserve map entries to generate in the blob. */ uint32_t spare_reserve_map_entries; /** * The minimum size in bytes of the blob. */ uint32_t minimum_blob_size; /** * The number of bytes of padding to add to the end of the blob. */ uint32_t blob_padding; /** * Visit all of the nodes recursively, and if they have labels then add * them to the node_paths and node_names vectors so that they can be * used in resolving cross references. Also collects phandle * properties that have been explicitly added. */ void collect_names_recursive(node_ptr &n, node_path &path); /** * Assign phandle properties to all nodes that have been referenced and * require one. This method will recursively visit the tree starting at * the node that it is passed. */ void assign_phandles(node_ptr &n, uint32_t &next); /** * Calls the recursive version of this method on every root node. */ void collect_names(); /** * Resolves all cross references. Any properties that refer to another * node must have their values replaced by either the node path or * phandle value. */ void resolve_cross_references(); /** - * Parse a top-level include directive. - */ - bool parse_include(input_buffer &input, - const std::string &dir, - std::vector &roots, - FILE *depfile, - bool &read_header); - /** * Parses a dts file in the given buffer and adds the roots to the parsed * set. The `read_header` argument indicates whether the header has * already been read. Some dts files place the header in an include, * rather than in the top-level file. */ - void parse_file(input_buffer &input, - const std::string &dir, + void parse_file(text_input_buffer &input, std::vector &roots, - FILE *depfile, bool &read_header); /** - * Allocates a new mmap()'d input buffer for use in parsing. This - * object then keeps a reference to it, ensuring that it is not - * deallocated until the device tree is destroyed. - */ - input_buffer *buffer_for_file(const char *path, bool warn=true); - /** * Template function that writes a dtb blob using the specified writer. * The writer defines the output format (assembly, blob). */ template void write(int fd); public: /** * Returns the node referenced by the property. If this is a tree that * is in source form, then we have a string that we can use to index * the cross_references array and so we can just look that up. */ node *referenced_node(property_value &v); /** * Writes this FDT as a DTB to the specified output. */ void write_binary(int fd); /** * Writes this FDT as an assembly representation of the DTB to the * specified output. The result can then be assembled and linked into * a program. */ void write_asm(int fd); /** * Writes the tree in DTS (source) format. */ void write_dts(int fd); /** * Default constructor. Creates a valid, but empty FDT. */ device_tree() : phandle_node_name(EPAPR), valid(true), boot_cpu(0), spare_reserve_map_entries(0), minimum_blob_size(0), blob_padding(0) {} /** * Constructs a device tree from the specified file name, referring to * a file that contains a device tree blob. */ - void parse_dtb(const char *fn, FILE *depfile); + void parse_dtb(const std::string &fn, FILE *depfile); /** * Constructs a device tree from the specified file name, referring to * a file that contains device tree source. */ - void parse_dts(const char *fn, FILE *depfile); + void parse_dts(const std::string &fn, FILE *depfile); /** * Returns whether this tree is valid. */ inline bool is_valid() { return valid; } /** * Sets the format for writing phandle properties. */ inline void set_phandle_format(phandle_format f) { phandle_node_name = f; } /** * Returns a pointer to the root node of this tree. No ownership * transfer. */ inline const node_ptr &get_root() const { return root; } /** * Sets the physical boot CPU. */ void set_boot_cpu(uint32_t cpu) { boot_cpu = cpu; } /** * Sorts the tree. Useful for debugging device trees. */ void sort() { root->sort(); } /** * Adds a path to search for include files. The argument must be a * nul-terminated string representing the path. The device tree keeps * a pointer to this string, but does not own it: the caller is * responsible for freeing it if required. */ void add_include_path(const char *path) { std::string p(path); include_paths.push_back(std::move(p)); } /** * Sets the number of empty reserve map entries to add. */ void set_empty_reserve_map_entries(uint32_t e) { spare_reserve_map_entries = e; } /** * Sets the minimum size, in bytes, of the blob. */ void set_blob_minimum_size(uint32_t s) { minimum_blob_size = s; } /** * Sets the amount of padding to add to the blob. */ void set_blob_padding(uint32_t p) { blob_padding = p; } /** * Parses a predefined macro value. */ bool parse_define(const char *def); }; } // namespace fdt } // namespace dtc #endif // !_FDT_HH_ Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/input_buffer.cc =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/input_buffer.cc (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/input_buffer.cc (revision 306832) @@ -1,831 +1,1221 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "input_buffer.hh" #include #include #include #include #include #include #include #ifndef NDEBUG #include #endif #include #include #include +#include +#include #ifndef MAP_PREFAULT_READ #define MAP_PREFAULT_READ 0 #endif +using std::string; + +namespace +{ +/** + * Subclass of input_buffer that mmap()s a file and owns the resulting memory. + * When this object is destroyed, the memory is unmapped. + */ +struct mmap_input_buffer : public dtc::input_buffer +{ + string fn; + const string &filename() const override + { + return fn; + } + /** + * Constructs a new buffer from the file passed in as a file + * descriptor. + */ + mmap_input_buffer(int fd, string &&filename); + /** + * Unmaps the buffer, if one exists. + */ + virtual ~mmap_input_buffer(); +}; +/** + * Input buffer read from standard input. This is used for reading device tree + * blobs and source from standard input. It reads the entire input into + * malloc'd memory, so will be very slow for large inputs. DTS and DTB files + * are very rarely more than 10KB though, so this is probably not a problem. + */ +struct stream_input_buffer : public dtc::input_buffer +{ + const string &filename() const override + { + static string n = ""; + return n; + } + /** + * The buffer that will store the data read from the standard input. + */ + std::vector b; + /** + * Constructs a new buffer from the standard input. + */ + stream_input_buffer(); +}; + +mmap_input_buffer::mmap_input_buffer(int fd, std::string &&filename) + : input_buffer(0, 0), fn(filename) +{ + struct stat sb; + if (fstat(fd, &sb)) + { + perror("Failed to stat file"); + } + size = sb.st_size; + buffer = (const char*)mmap(0, size, PROT_READ, MAP_PRIVATE | + MAP_PREFAULT_READ, fd, 0); + if (buffer == MAP_FAILED) + { + perror("Failed to mmap file"); + exit(EXIT_FAILURE); + } +} + +mmap_input_buffer::~mmap_input_buffer() +{ + if (buffer != 0) + { + munmap((void*)buffer, size); + } +} + +stream_input_buffer::stream_input_buffer() : input_buffer(0, 0) +{ + int c; + while ((c = fgetc(stdin)) != EOF) + { + b.push_back(c); + } + buffer = b.data(); + size = b.size(); +} + +} // Anonymous namespace + + namespace dtc { + void -input_buffer::skip_spaces() +input_buffer::skip_to(char c) { - if (cursor >= size) { return; } - if (cursor < 0) { return; } - char c = buffer[cursor]; + while ((cursor < size) && (buffer[cursor] != c)) + { + cursor++; + } +} + +void +text_input_buffer::skip_to(char c) +{ + while (!finished() && (*(*this) != c)) + { + ++(*this); + } +} + +void +text_input_buffer::skip_spaces() +{ + if (finished()) { return; } + char c = *(*this); + bool last_nl = false; while ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\f') || (c == '\v') || (c == '\r')) { - cursor++; - if (cursor > size) + last_nl = ((c == '\n') || (c == '\r')); + ++(*this); + if (finished()) { c = '\0'; } else { - c = buffer[cursor]; + c = *(*this); } } + // Skip C preprocessor leftovers + if ((c == '#') && ((cursor == 0) || last_nl)) + { + skip_to('\n'); + skip_spaces(); + } + if (consume("/include/")) + { + handle_include(); + skip_spaces(); + } } +void +text_input_buffer::handle_include() +{ + bool reallyInclude = true; + if (consume("if ")) + { + next_token(); + string name = parse_property_name(); + if (defines.count(name) > 0) + { + reallyInclude = true; + } + consume('/'); + } + next_token(); + if (!consume('"')) + { + parse_error("Expected quoted filename"); + return; + } + string file = parse_to('"'); + consume('"'); + if (!reallyInclude) + { + return; + } + string include_file = dir + '/' + file; + auto include_buffer = input_buffer::buffer_for_file(include_file, false); + if (include_buffer == 0) + { + for (auto i : include_paths) + { + include_file = i + '/' + file; + include_buffer = input_buffer::buffer_for_file(include_file, false); + if (include_buffer != 0) + { + break; + } + } + } + if (depfile) + { + putc(' ', depfile); + fputs(include_file.c_str(), depfile); + } + if (!include_buffer) + { + parse_error("Unable to locate input file"); + return; + } + input_stack.push(std::move(include_buffer)); +} + input_buffer input_buffer::buffer_from_offset(int offset, int s) { + if (offset < 0) + { + return input_buffer(); + } if (s == 0) { s = size - offset; } if (offset > size) { return input_buffer(); } if (s > (size-offset)) { return input_buffer(); } return input_buffer(&buffer[offset], s); } bool input_buffer::consume(const char *str) { int len = strlen(str); if (len > size - cursor) { return false; } else { for (int i=0 ; i(&buffer[size]); outInt = strtoull(&buffer[cursor], &end, 0); if (end == &buffer[cursor]) { return false; } cursor = end - buffer; return true; } namespace { /** * Convenience typedef for the type that we use for all values. */ typedef unsigned long long valty; /** * Expression tree currently being parsed. */ struct expression { + typedef text_input_buffer::source_location source_location; /** + * The type that is returned when computing the result. The boolean value + * indicates whether this is a valid expression. + * + * FIXME: Once we can use C++17, this should be `std::optional`. + */ + typedef std::pair result; + /** * Evaluate this node, taking into account operator precedence. */ - virtual valty operator()() = 0; + virtual result operator()() = 0; /** * Returns the precedence of this node. Lower values indicate higher * precedence. */ virtual int precedence() = 0; + /** + * Constructs an expression, storing the location where it was created. + */ + expression(source_location l) : loc(l) {} virtual ~expression() {} #ifndef NDEBUG /** * Dumps this expression to `std::cerr`, appending a newline if `nl` is * `true`. */ void dump(bool nl=false) { - if (this == nullptr) + void *ptr = this; + if (ptr == nullptr) { std::cerr << "{nullptr}\n"; return; } dump_impl(); if (nl) { std::cerr << '\n'; } } private: /** * Method that sublcasses override to implement the behaviour of `dump()`. */ virtual void dump_impl() = 0; #endif + protected: + source_location loc; }; /** * Expression wrapping a single integer. Leaf nodes in the expression tree. */ class terminal_expr : public expression { /** * The value that this wraps. */ valty val; /** * Evaluate. Trivially returns the value that this class wraps. */ - valty operator()() override + result operator()() override { - return val; + return {val, true}; } int precedence() override { return 0; } public: /** * Constructor. */ - terminal_expr(valty v) : val(v) {} + terminal_expr(source_location l, valty v) : expression(l), val(v) {} #ifndef NDEBUG void dump_impl() override { std::cerr << val; } #endif }; /** * Parenthetical expression. Exists to make the contents opaque. */ struct paren_expression : public expression { /** * The expression within the parentheses. */ expression_ptr subexpr; /** * Constructor. Takes the child expression as the only argument. */ - paren_expression(expression_ptr p) : subexpr(std::move(p)) {} + paren_expression(source_location l, expression_ptr p) : expression(l), + subexpr(std::move(p)) {} int precedence() override { return 0; } /** * Evaluate - just forwards to the underlying expression. */ - valty operator()() override + result operator()() override { return (*subexpr)(); } #ifndef NDEBUG void dump_impl() override { std::cerr << " ("; subexpr->dump(); std::cerr << ") "; } #endif }; /** * Template class for unary operators. The `OpChar` template parameter is * solely for debugging and makes it easy to print the expression. The `Op` * template parameter is a function object that implements the operator that * this class provides. Most of these are provided by the `` * header. */ template class unary_operator : public expression { /** * The subexpression for this unary operator. */ expression_ptr subexpr; - valty operator()() override + result operator()() override { Op op; - return op((*subexpr)()); + result s = (*subexpr)(); + if (!s.second) + { + return s; + } + return {op(s.first), true}; } /** * All unary operators have the same precedence. They are all evaluated * before binary expressions, but after parentheses. */ int precedence() override { return 3; } public: - unary_operator(expression_ptr p) : subexpr(std::move(p)) {} + unary_operator(source_location l, expression_ptr p) : + expression(l), subexpr(std::move(p)) {} #ifndef NDEBUG void dump_impl() override { std::cerr << OpChar; subexpr->dump(); } #endif }; /** * Abstract base class for binary operators. Allows the tree to be modified * without knowing what the operations actually are. */ struct binary_operator_base : public expression { + using expression::expression; /** * The left side of the expression. */ expression_ptr lhs; /** * The right side of the expression. */ expression_ptr rhs; /** * Insert a node somewhere down the path of left children, until it would * be preempting something that should execute first. */ void insert_left(binary_operator_base *new_left) { if (lhs->precedence() < new_left->precedence()) { new_left->rhs = std::move(lhs); lhs.reset(new_left); } else { static_cast(lhs.get())->insert_left(new_left); } } }; /** * Template class for binary operators. The precedence and the operation are * provided as template parameters. */ template struct binary_operator : public binary_operator_base { - valty operator()() override + result operator()() override { Op op; - return op((*lhs)(), (*rhs)()); + result l = (*lhs)(); + result r = (*rhs)(); + if (!(l.second && r.second)) + { + return {0, false}; + } + return {op(l.first, r.first), true}; } int precedence() override { return Precedence; } #ifdef NDEBUG /** * Constructor. Takes the name of the operator as an argument, for * debugging. Only stores it in debug mode. */ - binary_operator(const char *) {} + binary_operator(source_location l, const char *) : expression(l) {} #else const char *opName; - binary_operator(const char *o) : opName(o) {} + binary_operator(source_location l, const char *o) : + binary_operator_base(l), opName(o) {} void dump_impl() override { lhs->dump(); std::cerr << opName; rhs->dump(); } #endif }; /** * Ternary conditional operators (`cond ? true : false`) are a special case - * there are no other ternary operators. */ class ternary_conditional_operator : public expression { /** * The condition for the clause. */ expression_ptr cond; /** * The expression that this evaluates to if the condition is true. */ expression_ptr lhs; /** * The expression that this evaluates to if the condition is false. */ expression_ptr rhs; - valty operator()() override + result operator()() override { - return (*cond)() ? (*lhs)() : (*rhs)(); + result c = (*cond)(); + result l = (*lhs)(); + result r = (*rhs)(); + if (!(l.second && r.second && c.second)) + { + return {0, false}; + } + return c.first ? l : r; } int precedence() override { // The actual precedence of a ternary conditional operator is 15, but // its associativity is the opposite way around to the other operators, // so we fudge it slightly. return 3; } #ifndef NDEBUG void dump_impl() override { cond->dump(); std::cerr << " ? "; lhs->dump(); std::cerr << " : "; rhs->dump(); } #endif public: - ternary_conditional_operator(expression_ptr c, + ternary_conditional_operator(source_location sl, + expression_ptr c, expression_ptr l, expression_ptr r) : - cond(std::move(c)), lhs(std::move(l)), rhs(std::move(r)) {} + expression(sl), cond(std::move(c)), lhs(std::move(l)), + rhs(std::move(r)) {} }; template struct lshift { constexpr T operator()(const T &lhs, const T &rhs) const { return lhs << rhs; } }; template struct rshift { constexpr T operator()(const T &lhs, const T &rhs) const { return lhs >> rhs; } }; template struct unary_plus { constexpr T operator()(const T &val) const { return +val; } }; // TODO: Replace with std::bit_not once we can guarantee C++14 as a baseline. template struct bit_not { constexpr T operator()(const T &val) const { return ~val; } }; +template +struct divmod : public binary_operator<5, T> +{ + using binary_operator<5, T>::binary_operator; + using binary_operator_base::result; + result operator()() override + { + result r = (*binary_operator_base::rhs)(); + if (r.second && (r.first == 0)) + { + expression::loc.report_error("Division by zero"); + return {0, false}; + } + return binary_operator<5, T>::operator()(); + } +}; + } // anonymous namespace -expression_ptr input_buffer::parse_binary_expression(expression_ptr lhs) +expression_ptr text_input_buffer::parse_binary_expression(expression_ptr lhs) { next_token(); binary_operator_base *expr = nullptr; - char op = ((*this)[0]); + char op = *(*this); + source_location l = location(); switch (op) { default: return lhs; case '+': - expr = new binary_operator<6, std::plus>("+"); + expr = new binary_operator<6, std::plus>(l, "+"); break; case '-': - expr = new binary_operator<6, std::minus>("-"); + expr = new binary_operator<6, std::minus>(l, "-"); break; case '%': - expr = new binary_operator<5, std::modulus>("%"); + expr = new divmod>(l, "/"); break; case '*': - expr = new binary_operator<5, std::multiplies>("*"); + expr = new binary_operator<5, std::multiplies>(l, "*"); break; case '/': - expr = new binary_operator<5, std::divides>("/"); + expr = new divmod>(l, "/"); break; case '<': - cursor++; - switch ((*this)[0]) + switch (peek()) { default: parse_error("Invalid operator"); return nullptr; case ' ': case '(': case '0'...'9': - cursor--; - expr = new binary_operator<8, std::less>("<"); + expr = new binary_operator<8, std::less>(l, "<"); break; case '=': - expr = new binary_operator<8, std::less_equal>("<="); + ++(*this); + expr = new binary_operator<8, std::less_equal>(l, "<="); break; case '<': - expr = new binary_operator<7, lshift>("<<"); + ++(*this); + expr = new binary_operator<7, lshift>(l, "<<"); break; } break; case '>': - cursor++; - switch ((*this)[0]) + switch (peek()) { default: parse_error("Invalid operator"); return nullptr; case '(': case ' ': case '0'...'9': - cursor--; - expr = new binary_operator<8, std::greater>(">"); + expr = new binary_operator<8, std::greater>(l, ">"); break; case '=': - expr = new binary_operator<8, std::greater_equal>(">="); + ++(*this); + expr = new binary_operator<8, std::greater_equal>(l, ">="); break; case '>': - expr = new binary_operator<7, rshift>(">>"); + ++(*this); + expr = new binary_operator<7, rshift>(l, ">>"); break; return lhs; } break; case '=': - if ((*this)[1] != '=') + if (peek() != '=') { parse_error("Invalid operator"); return nullptr; } - consume('='); - expr = new binary_operator<9, std::equal_to>("=="); + expr = new binary_operator<9, std::equal_to>(l, "=="); break; case '!': - if ((*this)[1] != '=') + if (peek() != '=') { parse_error("Invalid operator"); return nullptr; } cursor++; - expr = new binary_operator<9, std::not_equal_to>("!="); + expr = new binary_operator<9, std::not_equal_to>(l, "!="); break; case '&': - if ((*this)[1] == '&') + if (peek() == '&') { - expr = new binary_operator<13, std::logical_and>("&&"); + expr = new binary_operator<13, std::logical_and>(l, "&&"); } else { - expr = new binary_operator<10, std::bit_and>("&"); + expr = new binary_operator<10, std::bit_and>(l, "&"); } break; case '|': - if ((*this)[1] == '|') + if (peek() == '|') { - expr = new binary_operator<12, std::logical_or>("||"); + expr = new binary_operator<12, std::logical_or>(l, "||"); } else { - expr = new binary_operator<14, std::bit_or>("|"); + expr = new binary_operator<14, std::bit_or>(l, "|"); } break; case '?': { consume('?'); expression_ptr true_case = parse_expression(); next_token(); if (!true_case || !consume(':')) { parse_error("Expected : in ternary conditional operator"); return nullptr; } expression_ptr false_case = parse_expression(); if (!false_case) { parse_error("Expected false condition for ternary operator"); return nullptr; } - return expression_ptr(new ternary_conditional_operator(std::move(lhs), + return expression_ptr(new ternary_conditional_operator(l, std::move(lhs), std::move(true_case), std::move(false_case))); } } - cursor++; + ++(*this); next_token(); expression_ptr e(expr); expression_ptr rhs(parse_expression()); if (!rhs) { return nullptr; } expr->lhs = std::move(lhs); if (rhs->precedence() < expr->precedence()) { expr->rhs = std::move(rhs); } else { // If we're a normal left-to-right expression, then we need to insert // this as the far-left child node of the rhs expression binary_operator_base *rhs_op = static_cast(rhs.get()); rhs_op->insert_left(expr); e.release(); return rhs; } return e; } -expression_ptr input_buffer::parse_expression(bool stopAtParen) +expression_ptr text_input_buffer::parse_expression(bool stopAtParen) { next_token(); unsigned long long leftVal; expression_ptr lhs; - switch ((*this)[0]) + source_location l = location(); + switch (*(*this)) { case '0'...'9': if (!consume_integer(leftVal)) { return nullptr; } - lhs.reset(new terminal_expr(leftVal)); + lhs.reset(new terminal_expr(l, leftVal)); break; case '(': { consume('('); expression_ptr &&subexpr = parse_expression(); if (!subexpr) { return nullptr; } - lhs.reset(new paren_expression(std::move(subexpr))); + lhs.reset(new paren_expression(l, std::move(subexpr))); if (!consume(')')) { return nullptr; } if (stopAtParen) { return lhs; } break; } case '+': { consume('+'); expression_ptr &&subexpr = parse_expression(); if (!subexpr) { return nullptr; } - lhs.reset(new unary_operator<'+', unary_plus>(std::move(subexpr))); + lhs.reset(new unary_operator<'+', unary_plus>(l, std::move(subexpr))); break; } case '-': { consume('-'); expression_ptr &&subexpr = parse_expression(); if (!subexpr) { return nullptr; } - lhs.reset(new unary_operator<'-', std::negate>(std::move(subexpr))); + lhs.reset(new unary_operator<'-', std::negate>(l, std::move(subexpr))); break; } case '!': { consume('!'); expression_ptr &&subexpr = parse_expression(); if (!subexpr) { return nullptr; } - lhs.reset(new unary_operator<'!', std::logical_not>(std::move(subexpr))); + lhs.reset(new unary_operator<'!', std::logical_not>(l, std::move(subexpr))); break; } case '~': { consume('~'); expression_ptr &&subexpr = parse_expression(); if (!subexpr) { return nullptr; } - lhs.reset(new unary_operator<'~', bit_not>(std::move(subexpr))); + lhs.reset(new unary_operator<'~', bit_not>(l, std::move(subexpr))); break; } } if (!lhs) { return nullptr; } return parse_binary_expression(std::move(lhs)); } bool -input_buffer::consume_integer_expression(unsigned long long &outInt) +text_input_buffer::consume_integer_expression(unsigned long long &outInt) { - switch ((*this)[0]) + switch (*(*this)) { case '(': { expression_ptr e(parse_expression(true)); if (!e) { return false; } - outInt = (*e)(); - return true; + auto r = (*e)(); + if (r.second) + { + outInt = r.first; + return true; + } + return false; } case '0'...'9': return consume_integer(outInt); default: return false; } } bool input_buffer::consume_hex_byte(uint8_t &outByte) { if (!ishexdigit((*this)[0]) && !ishexdigit((*this)[1])) { return false; } outByte = (digittoint((*this)[0]) << 4) | digittoint((*this)[1]); cursor += 2; return true; } -input_buffer& -input_buffer::next_token() +text_input_buffer& +text_input_buffer::next_token() { + auto &self = *this; int start; do { start = cursor; skip_spaces(); + if (finished()) + { + return self; + } // Parse /* comments - if ((*this)[0] == '/' && (*this)[1] == '*') + if (*self == '/' && peek() == '*') { // eat the start of the comment - ++(*this); - ++(*this); + ++self; + ++self; do { // Find the ending * of */ - while ((**this != '\0') && (**this != '*')) + while ((*self != '\0') && (*self != '*') && !finished()) { - ++(*this); + ++self; } // Eat the * - ++(*this); - } while ((**this != '\0') && (**this != '/')); + ++self; + } while ((*self != '\0') && (*self != '/') && !finished()); // Eat the / - ++(*this); + ++self; } // Parse // comments - if (((*this)[0] == '/' && (*this)[1] == '/')) + if ((*self == '/' && peek() == '/')) { // eat the start of the comment - ++(*this); - ++(*this); + ++self; + ++self; // Find the ending of the line - while (**this != '\n') + while (*self != '\n' && !finished()) { - ++(*this); + ++self; } // Eat the \n - ++(*this); + ++self; } } while (start != cursor); - return *this; + return self; } void -input_buffer::parse_error(const char *msg) +text_input_buffer::parse_error(const char *msg) { + if (input_stack.empty()) + { + fprintf(stderr, "Error: %s\n", msg); + return; + } + input_buffer &b = *input_stack.top(); + parse_error(msg, b, b.cursor); +} +void +text_input_buffer::parse_error(const char *msg, + input_buffer &b, + int loc) +{ int line_count = 1; int line_start = 0; - int line_end = cursor; - for (int i=cursor ; i>0 ; --i) + int line_end = loc; + if (loc < 0 || loc > b.size) { - if (buffer[i] == '\n') + return; + } + for (int i=loc ; i>0 ; --i) + { + if (b.buffer[i] == '\n') { line_count++; if (line_start == 0) { line_start = i+1; } } } - for (int i=cursor+1 ; i= 'a') && (c <= 'z')) || ((c >= 'A') && + (c <= 'Z')); } - size = sb.st_size; - buffer = (const char*)mmap(0, size, PROT_READ, MAP_PRIVATE | - MAP_PREFAULT_READ, fd, 0); - if (buffer == MAP_FAILED) +}; +/** + * Check whether a character is in the set allowed for node names. This is a + * class so that it can be used with a template function for parsing strings. + */ +struct is_node_name_character +{ + static inline bool check(const char c) { - perror("Failed to mmap file"); - exit(EXIT_FAILURE); + switch(c) + { + default: + return false; + case 'a'...'z': case 'A'...'Z': case '0'...'9': + case ',': case '.': case '+': case '-': + case '_': + return true; + } } +}; +/** + * Check whether a character is in the set allowed for property names. This is + * a class so that it can be used with a template function for parsing strings. + */ +struct is_property_name_character +{ + static inline bool check(const char c) + { + switch(c) + { + default: + return false; + case 'a'...'z': case 'A'...'Z': case '0'...'9': + case ',': case '.': case '+': case '-': + case '_': case '#': + return true; + } + } +}; + +template +string parse(text_input_buffer &s) +{ + std::vector bytes; + for (char c=*s ; T::check(c) ; c=*(++s)) + { + bytes.push_back(c); + } + return string(bytes.begin(), bytes.end()); } -mmap_input_buffer::~mmap_input_buffer() +} + +string +text_input_buffer::parse_node_name() { - if (buffer != 0) + return parse(*this); +} + +string +text_input_buffer::parse_property_name() +{ + return parse(*this); +} + +string +text_input_buffer::parse_node_or_property_name(bool &is_property) +{ + if (is_property) { - munmap((void*)buffer, size); + return parse_property_name(); } + std::vector bytes; + for (char c=*(*this) ; is_node_name_character::check(c) ; c=*(++(*this))) + { + bytes.push_back(c); + } + for (char c=*(*this) ; is_property_name_character::check(c) ; c=*(++(*this))) + { + bytes.push_back(c); + is_property = true; + } + return string(bytes.begin(), bytes.end()); } -stream_input_buffer::stream_input_buffer() : input_buffer(0, 0) +string +input_buffer::parse_to(char stop) { - int c; - while ((c = fgetc(stdin)) != EOF) + std::vector bytes; + for (char c=*(*this) ; c != stop ; c=*(++(*this))) { - b.push_back(c); + bytes.push_back(c); } - buffer = b.data(); - size = b.size(); + return string(bytes.begin(), bytes.end()); +} + +string +text_input_buffer::parse_to(char stop) +{ + std::vector bytes; + for (char c=*(*this) ; c != stop ; c=*(++(*this))) + { + if (finished()) + { + break; + } + bytes.push_back(c); + } + return string(bytes.begin(), bytes.end()); +} + +char +text_input_buffer::peek() +{ + return (*input_stack.top())[1]; +} + +std::unique_ptr +input_buffer::buffer_for_file(const string &path, bool warn) +{ + if (path == "-") + { + std::unique_ptr b(new stream_input_buffer()); + return b; + } + int source = open(path.c_str(), O_RDONLY); + if (source == -1) + { + if (warn) + { + fprintf(stderr, "Unable to open file '%s'. %s\n", path.c_str(), strerror(errno)); + } + return 0; + } + struct stat st; + if (fstat(source, &st) == 0 && S_ISDIR(st.st_mode)) + { + if (warn) + { + fprintf(stderr, "File %s is a directory\n", path.c_str()); + } + close(source); + return 0; + } + std::unique_ptr b(new mmap_input_buffer(source, std::string(path))); + close(source); + return b; } } // namespace dtc Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/input_buffer.hh =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/input_buffer.hh (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/input_buffer.hh (revision 306832) @@ -1,316 +1,537 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _INPUT_BUFFER_HH_ #define _INPUT_BUFFER_HH_ #include "util.hh" #include +#include +#include +#include namespace dtc { namespace { struct expression; typedef std::unique_ptr expression_ptr; } /** * Class encapsulating the input file. Can be used as a const char*, but has * range checking. Attempting to access anything out of range will return a 0 * byte. The input buffer can be cheaply copied, without copying the * underlying memory, however it is the user's responsibility to ensure that * such copies do not persist beyond the lifetime of the underlying memory. * * This also contains methods for reporting errors and for consuming the token * stream. */ class input_buffer { + friend class text_input_buffer; protected: /** * The buffer. This class doesn't own the buffer, but the * mmap_input_buffer subclass does. */ const char* buffer; /** * The size of the buffer. */ int size; private: /** - * Parse an expression. If `stopAtParen` is set, then only parse a number - * or a parenthetical expression, otherwise assume that either is the - * left-hand side of a binary expression and try to parse the right-hand - * side. - */ - expression_ptr parse_expression(bool stopAtParen=false); - /** - * Parse a binary expression, having already parsed the right-hand side. - */ - expression_ptr parse_binary_expression(expression_ptr lhs); - /** * The current place in the buffer where we are reading. This class * keeps a separate size, pointer, and cursor so that we can move * forwards and backwards and still have checks that we haven't fallen * off either end. */ int cursor; /** * Private constructor. This is used to create input buffers that * refer to the same memory, but have different cursors. */ input_buffer(const char* b, int s, int c) : buffer(b), size(s), cursor(c) {} + public: /** - * Reads forward past any spaces. The DTS format is not whitespace - * sensitive and so we want to scan past whitespace when reading it. + * Returns the file name associated with this buffer. */ - void skip_spaces(); - public: + virtual const std::string &filename() const + { + static std::string s; + return s; + } + static std::unique_ptr buffer_for_file(const std::string &path, + bool warn=true); /** + * Skips all characters in the input until the specified character is + * encountered. + */ + void skip_to(char); + /** + * Parses up to a specified character and returns the intervening + * characters as a string. + */ + std::string parse_to(char); + /** * Return whether all input has been consumed. */ bool finished() { return cursor >= size; } /** * Virtual destructor. Does nothing, but exists so that subclasses * that own the memory can run cleanup code for deallocating it. */ virtual ~input_buffer() {}; /** * Constructs an empty buffer. */ input_buffer() : buffer(0), size(0), cursor(0) {} /** * Constructs a new buffer with a specified memory region and size. */ input_buffer(const char* b, int s) : buffer(b), size(s), cursor(0){} /** * Returns a new input buffer referring into this input, clamped to the * specified size. If the requested buffer would fall outside the * range of this one, then it returns an empty buffer. * * The returned buffer shares the same underlying storage as the * original. This is intended to be used for splitting up the various * sections of a device tree blob. Requesting a size of 0 will give a * buffer that extends to the end of the available memory. */ input_buffer buffer_from_offset(int offset, int s=0); /** - * Returns true if this buffer has no unconsumed space in it. - */ - inline bool empty() - { - return cursor >= size; - } - /** * Dereferencing operator, allows the buffer to be treated as a char* * and dereferenced to give a character. This returns a null byte if * the cursor is out of range. */ inline char operator*() { if (cursor >= size) { return '\0'; } if (cursor < 0) { return '\0'; } return buffer[cursor]; } /** * Array subscripting operator, returns a character at the specified * index offset from the current cursor. The offset may be negative, * to reread characters that have already been read. If the current * cursor plus offset is outside of the range, this returns a nul * byte. */ inline char operator[](int offset) { if (cursor + offset >= size) { return '\0'; } if (cursor + offset < 0) { return '\0'; } return buffer[cursor + offset]; } /** * Increments the cursor, iterating forward in the buffer. */ inline input_buffer &operator++() { cursor++; return *this; } /** - * Cast to char* operator. Returns a pointer into the buffer that can - * be used for constructing strings. - */ - inline operator const char*() - { - if (cursor >= size) { return 0; } - if (cursor < 0) { return 0; } - return &buffer[cursor]; - } - /** * Consumes a character. Moves the cursor one character forward if the * next character matches the argument, returning true. If the current * character does not match the argument, returns false. */ inline bool consume(char c) { - if ((*this)[0] == c) + if (*(*this) == c) { ++(*this); return true; } return false; } /** * Consumes a string. If the (null-terminated) string passed as the * argument appears in the input, advances the cursor to the end and * returns true. Returns false if the string does not appear at the * current point in the input. */ bool consume(const char *str); /** * Reads an integer in base 8, 10, or 16. Returns true and advances * the cursor to the end of the integer if the cursor points to an * integer, returns false and does not move the cursor otherwise. * * The parsed value is returned via the argument. */ bool consume_integer(unsigned long long &outInt); /** * Reads an arithmetic expression (containing any of the normal C * operators), evaluates it, and returns the result. */ bool consume_integer_expression(unsigned long long &outInt); /** + * Consumes two hex digits and return the resulting byte via the first + * argument. If the next two characters are hex digits, returns true + * and advances the cursor. If not, then returns false and leaves the + * cursor in place. + */ + bool consume_hex_byte(uint8_t &outByte); + /** * Template function that consumes a binary value in big-endian format * from the input stream. Returns true and advances the cursor if * there is a value of the correct size. This function assumes that * all values must be natively aligned, and so advances the cursor to * the correct alignment before reading. */ template bool consume_binary(T &out) { int align = 0; int type_size = sizeof(T); if (cursor % type_size != 0) { align = type_size - (cursor % type_size); } if (size < cursor + align + type_size) { return false; } cursor += align; assert(cursor % type_size == 0); out = 0; for (int i=0 ; i inline bool input_buffer::consume_binary(uint8_t &out) { if (size < cursor + 1) { return false; } out = buffer[cursor++]; return true; } /** - * Subclass of input_buffer that mmap()s a file and owns the resulting memory. - * When this object is destroyed, the memory is unmapped. + * An input buffer subclass used for parsing DTS files. This manages a stack + * of input buffers to handle /input/ operations. */ -struct mmap_input_buffer : public input_buffer +class text_input_buffer { + std::unordered_set defines; /** - * Constructs a new buffer from the file passed in as a file - * descriptor. + * The cursor is the input into the input stream where we are currently reading. */ - mmap_input_buffer(int fd); + int cursor = 0; /** - * Unmaps the buffer, if one exists. + * The current stack of includes. The current input is always from the top + * of the stack. */ - virtual ~mmap_input_buffer(); -}; -/** - * Input buffer read from standard input. This is used for reading device tree - * blobs and source from standard input. It reads the entire input into - * malloc'd memory, so will be very slow for large inputs. DTS and DTB files - * are very rarely more than 10KB though, so this is probably not a problem. - */ -struct stream_input_buffer : public input_buffer -{ + std::stack> input_stack; /** - * The buffer that will store the data read from the standard input. + * */ - std::vector b; + const std::vector include_paths; /** - * Constructs a new buffer from the standard input. + * Reads forward past any spaces. The DTS format is not whitespace + * sensitive and so we want to scan past whitespace when reading it. */ - stream_input_buffer(); + void skip_spaces(); + /** + * Returns the character immediately after the current one. + * + * This method does not look between files. + */ + char peek(); + /** + * If a /include/ token is encountered, then look up the corresponding + * input file, push it onto the input stack, and continue. + */ + void handle_include(); + /** + * The base directory for this file. + */ + const std::string dir; + /** + * The file where dependencies should be output. + */ + FILE *depfile; + public: + /** + * Construct a new text input buffer with the specified buffer as the start + * of parsing and the specified set of input paths for handling new + * inclusions. + */ + text_input_buffer(std::unique_ptr &&b, + std::unordered_set &&d, + std::vector &&i, + const std::string directory, + FILE *deps) + : defines(d), include_paths(i), dir(directory), depfile(deps) + { + input_stack.push(std::move(b)); + } + /** + * Skips all characters in the input until the specified character is + * encountered. + */ + void skip_to(char); + /** + * Parse an expression. If `stopAtParen` is set, then only parse a number + * or a parenthetical expression, otherwise assume that either is the + * left-hand side of a binary expression and try to parse the right-hand + * side. + */ + expression_ptr parse_expression(bool stopAtParen=false); + /** + * Parse a binary expression, having already parsed the right-hand side. + */ + expression_ptr parse_binary_expression(expression_ptr lhs); + /** + * Return whether all input has been consumed. + */ + bool finished() + { + return input_stack.empty() || + ((input_stack.size() == 1) && input_stack.top()->finished()); + } + /** + * Dereferencing operator. Returns the current character in the top input buffer. + */ + inline char operator*() + { + if (input_stack.empty()) + { + return 0; + } + return *(*input_stack.top()); + } + /** + * Increments the cursor, iterating forward in the buffer. + */ + inline text_input_buffer &operator++() + { + if (input_stack.empty()) + { + return *this; + } + cursor++; + auto &top = *input_stack.top(); + ++top; + if (top.finished()) + { + input_stack.pop(); + } + return *this; + } + /** + * Consumes a character. Moves the cursor one character forward if the + * next character matches the argument, returning true. If the current + * character does not match the argument, returns false. + */ + inline bool consume(char c) + { + if (*(*this) == c) + { + ++(*this); + return true; + } + return false; + } + /** + * Consumes a string. If the (null-terminated) string passed as the + * argument appears in the input, advances the cursor to the end and + * returns true. Returns false if the string does not appear at the + * current point in the input. + * + * This method does not scan between files. + */ + bool consume(const char *str) + { + if (input_stack.empty()) + { + return false; + } + return input_stack.top()->consume(str); + } + /** + * Reads an integer in base 8, 10, or 16. Returns true and advances + * the cursor to the end of the integer if the cursor points to an + * integer, returns false and does not move the cursor otherwise. + * + * The parsed value is returned via the argument. + * + * This method does not scan between files. + */ + bool consume_integer(unsigned long long &outInt) + { + if (input_stack.empty()) + { + return false; + } + return input_stack.top()->consume_integer(outInt); + } + /** + * Reads an arithmetic expression (containing any of the normal C + * operators), evaluates it, and returns the result. + */ + bool consume_integer_expression(unsigned long long &outInt); + /** + * Consumes two hex digits and return the resulting byte via the first + * argument. If the next two characters are hex digits, returns true + * and advances the cursor. If not, then returns false and leaves the + * cursor in place. + * + * This method does not scan between files. + */ + bool consume_hex_byte(uint8_t &outByte) + { + if (input_stack.empty()) + { + return false; + } + return input_stack.top()->consume_hex_byte(outByte); + } + /** + * Returns the longest string in the input buffer starting at the + * current cursor and composed entirely of characters that are valid in + * node names. + */ + std::string parse_node_name(); + /** + * Returns the longest string in the input buffer starting at the + * current cursor and composed entirely of characters that are valid in + * property names. + */ + std::string parse_property_name(); + /** + * Parses either a node or a property name. If is_property is true on + * entry, then only property names are parsed. If it is false, then it + * will be set, on return, to indicate whether the parsed name is only + * valid as a property. + */ + std::string parse_node_or_property_name(bool &is_property); + /** + * Parses up to a specified character and returns the intervening + * characters as a string. + */ + std::string parse_to(char); + /** + * Advances the cursor to the start of the next token, skipping + * comments and whitespace. If the cursor already points to the start + * of a token, then this function does nothing. + */ + text_input_buffer &next_token(); + /** + * Location in the source file. This should never be interpreted by + * anything other than error reporting functions of this class. It will + * eventually become something more complex than an `int`. + */ + class source_location + { + friend class text_input_buffer; + /** + * The text buffer object that included `b`. + */ + text_input_buffer &buffer; + /** + * The underlying buffer that contains this location. + */ + std::shared_ptr b; + /** + * The offset within the current buffer of the source location. + */ + int cursor; + source_location(text_input_buffer &buf) + : buffer(buf), + b(buf.input_stack.empty() ? nullptr : buf.input_stack.top()), + cursor(b ? b->cursor : 0) {} + public: + /** + * Report an error at this location. + */ + void report_error(const char *msg) + { + if (b) + { + buffer.parse_error(msg, *b, cursor); + } + else + { + buffer.parse_error(msg); + } + } + }; + /** + * Returns the current source location. + */ + source_location location() + { + return { *this }; + } + /** + * Prints a message indicating the location of a parse error. + */ + void parse_error(const char *msg); + private: + /** + * Prints a message indicating the location of a parse error, given a + * specified location. This is used when input has already moved beyond + * the location that caused the failure. + */ + void parse_error(const char *msg, input_buffer &b, int loc); }; } // namespace dtc #endif // !_INPUT_BUFFER_HH_ Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/string.cc =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/string.cc (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/string.cc (revision 306832) @@ -1,258 +1,148 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ -#include "string.hh" +#include +#include +#include #include -#include +#include -namespace -{ -/** - * The source files are ASCII, so we provide a non-locale-aware version of - * isalpha. This is a class so that it can be used with a template function - * for parsing strings. - */ -struct is_alpha -{ - static inline bool check(const char c) - { - return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && - (c <= 'Z')); - } -}; -/** - * Check whether a character is in the set allowed for node names. This is a - * class so that it can be used with a template function for parsing strings. - */ -struct is_node_name_character -{ - static inline bool check(const char c) - { - switch(c) - { - default: - return false; - case 'a'...'z': case 'A'...'Z': case '0'...'9': - case ',': case '.': case '+': case '-': - case '_': - return true; - } - } -}; -/** - * Check whether a character is in the set allowed for property names. This is - * a class so that it can be used with a template function for parsing strings. - */ -struct is_property_name_character -{ - static inline bool check(const char c) - { - switch(c) - { - default: - return false; - case 'a'...'z': case 'A'...'Z': case '0'...'9': - case ',': case '.': case '+': case '-': - case '_': case '#': - return true; - } - } -}; +#include "util.hh" -} +using std::string; namespace dtc { -template string -string::parse(input_buffer &s) -{ - const char *start = s; - int l=0; - while (T::check(*s)) { l++; ++s; } - return string(start, l); -} - -string::string(input_buffer &s) : start((const char*)s), length(0) -{ - while(s[length] != '\0') - { - length++; - } -} - -string -string::parse_node_name(input_buffer &s) -{ - return parse(s); -} - -string -string::parse_property_name(input_buffer &s) -{ - return parse(s); -} -string -string::parse_node_or_property_name(input_buffer &s, bool &is_property) -{ - if (is_property) - { - return parse_property_name(s); - } - const char *start = s; - int l=0; - while (is_node_name_character::check(*s)) - { - l++; - ++s; - } - while (is_property_name_character::check(*s)) - { - l++; - ++s; - is_property = true; - } - return string(start, l); -} - -bool -string::operator==(const string& other) const -{ - return (length == other.length) && - (memcmp(start, other.start, length) == 0); -} - -bool -string::operator==(const char *other) const -{ - return strncmp(other, start, length) == 0; -} - -bool -string::operator<(const string& other) const -{ - if (length < other.length) { return true; } - if (length > other.length) { return false; } - return memcmp(start, other.start, length) < 0; -} - void -string::push_to_buffer(byte_buffer &buffer, bool escapes) +push_string(byte_buffer &buffer, const string &s, bool escapes) { - for (int i=0 ; i= '0') + if (i+1 < length && s[i+1] <= '7' && s[i+1] >= '0') { v <<= 3; - v |= digittoint(start[i+1]); + v |= digittoint(s[i+1]); i++; - if (i+1 < length && start[i+1] <= '7' && start[i+1] >= '0') + if (i+1 < length && s[i+1] <= '7' && s[i+1] >= '0') { v <<= 3; - v |= digittoint(start[i+1]); + v |= digittoint(s[i+1]); } } c = (uint8_t)v; break; } case 'x': { ++i; if (i >= length) { break; } - int v = digittoint(start[i]); - if (i+1 < length && ishexdigit(start[i+1])) + int v = digittoint(s[i]); + if (i+1 < length && ishexdigit(s[i+1])) { v <<= 4; - v |= digittoint(start[++i]); + v |= digittoint(s[++i]); } c = (uint8_t)v; break; } } } buffer.push_back(c); } } -void -string::print(FILE *file) +std::string dirname(const string &s) { - fwrite(start, length, 1, file); + if (s == string()) + { + return string(); + } + char *str = strdup(s.c_str()); + string dn(::dirname(str)); + free(str); + return dn; } -void -string::dump() +std::string basename(const string &s) { - print(stderr); + if (s == string()) + { + return string(); + } + char *str = strdup(s.c_str()); + string bn(::basename(str)); + free(str); + return bn; } - } // namespace dtc Index: user/alc/PQ_LAUNDRY/usr.bin/dtc/util.hh =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/dtc/util.hh (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/dtc/util.hh (revision 306832) @@ -1,92 +1,115 @@ /*- * Copyright (c) 2013 David Chisnall * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _UTIL_HH_ #define _UTIL_HH_ #include // If we aren't using C++11, then just ignore static asserts. #if __cplusplus < 201103L #ifndef static_assert #define static_assert(x, y) ((void)0) #endif #endif namespace dtc { /** * Type for a buffer of bytes. This is used for a lot of short-lived temporary * variables, so may eventually be changed to something like LLVM's * SmallVector, but currently the program runs in a tiny fraction of a second, * so this is not an issue. */ typedef std::vector byte_buffer; /** * Helper function to push a big endian value into a byte buffer. We use * native-endian values for all of the in-memory data structures and only * transform them into big endian form for output. */ template inline void push_big_endian(byte_buffer &v, T val) { static_assert(sizeof(T) > 1, "Big endian doesn't make sense for single-byte values"); for (int bit=(sizeof(T) - 1)*8 ; bit>=0 ; bit-= 8) { v.push_back((val >> bit) & 0xff); } } +void push_string(byte_buffer &v, const std::string &s, bool escapes=false); + /** * Simple inline non-locale-aware check that this is a valid ASCII * digit. */ inline bool isdigit(char c) { return (c >= '0') && (c <= '9'); } /** * Simple inline non-locale-aware check that this is a valid ASCII * hex digit. */ inline bool ishexdigit(char c) { return ((c >= '0') && (c <= '9')) || ((c >= 'a') && (c <= 'f')) || - ((c >= 'A') && (c <= 'Z')); + ((c >= 'A') && (c <= 'F')); } + +/** + * Simple inline non-locale-aware check that this is a valid ASCII + * letter. + */ +inline bool isalpha(char c) +{ + return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')); +} + +/** + * A wrapper around dirname(3) that handles inconsistencies relating to memory + * management between platforms and provides a std::string interface. + */ +std::string dirname(const std::string&); + +/** + * A wrapper around basename(3) that handles inconsistencies relating to memory + * management between platforms and provides a std::string interface. + */ +std::string basename(const std::string&); }// namespace dtc #endif // !_UTIL_HH_ Index: user/alc/PQ_LAUNDRY/usr.bin/elfdump/elfdump.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/elfdump/elfdump.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/elfdump/elfdump.c (revision 306832) @@ -1,1261 +1,1257 @@ /*- * Copyright (c) 2003 David O'Brien. All rights reserved. * Copyright (c) 2001 Jake Burkholder * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include -#include #include #define ED_DYN (1<<0) #define ED_EHDR (1<<1) #define ED_GOT (1<<2) #define ED_HASH (1<<3) #define ED_INTERP (1<<4) #define ED_NOTE (1<<5) #define ED_PHDR (1<<6) #define ED_REL (1<<7) #define ED_SHDR (1<<8) #define ED_SYMTAB (1<<9) #define ED_ALL ((1<<10)-1) #define elf_get_addr elf_get_quad #define elf_get_off elf_get_quad #define elf_get_size elf_get_quad enum elf_member { D_TAG = 1, D_PTR, D_VAL, E_CLASS, E_DATA, E_OSABI, E_TYPE, E_MACHINE, E_VERSION, E_ENTRY, E_PHOFF, E_SHOFF, E_FLAGS, E_EHSIZE, E_PHENTSIZE, E_PHNUM, E_SHENTSIZE, E_SHNUM, E_SHSTRNDX, N_NAMESZ, N_DESCSZ, N_TYPE, P_TYPE, P_OFFSET, P_VADDR, P_PADDR, P_FILESZ, P_MEMSZ, P_FLAGS, P_ALIGN, SH_NAME, SH_TYPE, SH_FLAGS, SH_ADDR, SH_OFFSET, SH_SIZE, SH_LINK, SH_INFO, SH_ADDRALIGN, SH_ENTSIZE, ST_NAME, ST_VALUE, ST_SIZE, ST_INFO, ST_SHNDX, R_OFFSET, R_INFO, RA_OFFSET, RA_INFO, RA_ADDEND }; typedef enum elf_member elf_member_t; static int elf32_offsets[] = { 0, offsetof(Elf32_Dyn, d_tag), offsetof(Elf32_Dyn, d_un.d_ptr), offsetof(Elf32_Dyn, d_un.d_val), offsetof(Elf32_Ehdr, e_ident[EI_CLASS]), offsetof(Elf32_Ehdr, e_ident[EI_DATA]), offsetof(Elf32_Ehdr, e_ident[EI_OSABI]), offsetof(Elf32_Ehdr, e_type), offsetof(Elf32_Ehdr, e_machine), offsetof(Elf32_Ehdr, e_version), offsetof(Elf32_Ehdr, e_entry), offsetof(Elf32_Ehdr, e_phoff), offsetof(Elf32_Ehdr, e_shoff), offsetof(Elf32_Ehdr, e_flags), offsetof(Elf32_Ehdr, e_ehsize), offsetof(Elf32_Ehdr, e_phentsize), offsetof(Elf32_Ehdr, e_phnum), offsetof(Elf32_Ehdr, e_shentsize), offsetof(Elf32_Ehdr, e_shnum), offsetof(Elf32_Ehdr, e_shstrndx), offsetof(Elf_Note, n_namesz), offsetof(Elf_Note, n_descsz), offsetof(Elf_Note, n_type), offsetof(Elf32_Phdr, p_type), offsetof(Elf32_Phdr, p_offset), offsetof(Elf32_Phdr, p_vaddr), offsetof(Elf32_Phdr, p_paddr), offsetof(Elf32_Phdr, p_filesz), offsetof(Elf32_Phdr, p_memsz), offsetof(Elf32_Phdr, p_flags), offsetof(Elf32_Phdr, p_align), offsetof(Elf32_Shdr, sh_name), offsetof(Elf32_Shdr, sh_type), offsetof(Elf32_Shdr, sh_flags), offsetof(Elf32_Shdr, sh_addr), offsetof(Elf32_Shdr, sh_offset), offsetof(Elf32_Shdr, sh_size), offsetof(Elf32_Shdr, sh_link), offsetof(Elf32_Shdr, sh_info), offsetof(Elf32_Shdr, sh_addralign), offsetof(Elf32_Shdr, sh_entsize), offsetof(Elf32_Sym, st_name), offsetof(Elf32_Sym, st_value), offsetof(Elf32_Sym, st_size), offsetof(Elf32_Sym, st_info), offsetof(Elf32_Sym, st_shndx), offsetof(Elf32_Rel, r_offset), offsetof(Elf32_Rel, r_info), offsetof(Elf32_Rela, r_offset), offsetof(Elf32_Rela, r_info), offsetof(Elf32_Rela, r_addend) }; static int elf64_offsets[] = { 0, offsetof(Elf64_Dyn, d_tag), offsetof(Elf64_Dyn, d_un.d_ptr), offsetof(Elf64_Dyn, d_un.d_val), offsetof(Elf32_Ehdr, e_ident[EI_CLASS]), offsetof(Elf32_Ehdr, e_ident[EI_DATA]), offsetof(Elf32_Ehdr, e_ident[EI_OSABI]), offsetof(Elf64_Ehdr, e_type), offsetof(Elf64_Ehdr, e_machine), offsetof(Elf64_Ehdr, e_version), offsetof(Elf64_Ehdr, e_entry), offsetof(Elf64_Ehdr, e_phoff), offsetof(Elf64_Ehdr, e_shoff), offsetof(Elf64_Ehdr, e_flags), offsetof(Elf64_Ehdr, e_ehsize), offsetof(Elf64_Ehdr, e_phentsize), offsetof(Elf64_Ehdr, e_phnum), offsetof(Elf64_Ehdr, e_shentsize), offsetof(Elf64_Ehdr, e_shnum), offsetof(Elf64_Ehdr, e_shstrndx), offsetof(Elf_Note, n_namesz), offsetof(Elf_Note, n_descsz), offsetof(Elf_Note, n_type), offsetof(Elf64_Phdr, p_type), offsetof(Elf64_Phdr, p_offset), offsetof(Elf64_Phdr, p_vaddr), offsetof(Elf64_Phdr, p_paddr), offsetof(Elf64_Phdr, p_filesz), offsetof(Elf64_Phdr, p_memsz), offsetof(Elf64_Phdr, p_flags), offsetof(Elf64_Phdr, p_align), offsetof(Elf64_Shdr, sh_name), offsetof(Elf64_Shdr, sh_type), offsetof(Elf64_Shdr, sh_flags), offsetof(Elf64_Shdr, sh_addr), offsetof(Elf64_Shdr, sh_offset), offsetof(Elf64_Shdr, sh_size), offsetof(Elf64_Shdr, sh_link), offsetof(Elf64_Shdr, sh_info), offsetof(Elf64_Shdr, sh_addralign), offsetof(Elf64_Shdr, sh_entsize), offsetof(Elf64_Sym, st_name), offsetof(Elf64_Sym, st_value), offsetof(Elf64_Sym, st_size), offsetof(Elf64_Sym, st_info), offsetof(Elf64_Sym, st_shndx), offsetof(Elf64_Rel, r_offset), offsetof(Elf64_Rel, r_info), offsetof(Elf64_Rela, r_offset), offsetof(Elf64_Rela, r_info), offsetof(Elf64_Rela, r_addend) }; /* http://www.sco.com/developers/gabi/latest/ch5.dynamic.html#tag_encodings */ static const char * d_tags(u_int64_t tag) { static char unknown_tag[48]; switch (tag) { case DT_NULL: return "DT_NULL"; case DT_NEEDED: return "DT_NEEDED"; case DT_PLTRELSZ: return "DT_PLTRELSZ"; case DT_PLTGOT: return "DT_PLTGOT"; case DT_HASH: return "DT_HASH"; case DT_STRTAB: return "DT_STRTAB"; case DT_SYMTAB: return "DT_SYMTAB"; case DT_RELA: return "DT_RELA"; case DT_RELASZ: return "DT_RELASZ"; case DT_RELAENT: return "DT_RELAENT"; case DT_STRSZ: return "DT_STRSZ"; case DT_SYMENT: return "DT_SYMENT"; case DT_INIT: return "DT_INIT"; case DT_FINI: return "DT_FINI"; case DT_SONAME: return "DT_SONAME"; case DT_RPATH: return "DT_RPATH"; case DT_SYMBOLIC: return "DT_SYMBOLIC"; case DT_REL: return "DT_REL"; case DT_RELSZ: return "DT_RELSZ"; case DT_RELENT: return "DT_RELENT"; case DT_PLTREL: return "DT_PLTREL"; case DT_DEBUG: return "DT_DEBUG"; case DT_TEXTREL: return "DT_TEXTREL"; case DT_JMPREL: return "DT_JMPREL"; case DT_BIND_NOW: return "DT_BIND_NOW"; case DT_INIT_ARRAY: return "DT_INIT_ARRAY"; case DT_FINI_ARRAY: return "DT_FINI_ARRAY"; case DT_INIT_ARRAYSZ: return "DT_INIT_ARRAYSZ"; case DT_FINI_ARRAYSZ: return "DT_FINI_ARRAYSZ"; case DT_RUNPATH: return "DT_RUNPATH"; case DT_FLAGS: return "DT_FLAGS"; case DT_PREINIT_ARRAY: return "DT_PREINIT_ARRAY"; /* XXX DT_ENCODING */ case DT_PREINIT_ARRAYSZ:return "DT_PREINIT_ARRAYSZ"; /* 0x6000000D - 0x6ffff000 operating system-specific semantics */ case 0x6ffffdf5: return "DT_GNU_PRELINKED"; case 0x6ffffdf6: return "DT_GNU_CONFLICTSZ"; case 0x6ffffdf7: return "DT_GNU_LIBLISTSZ"; case 0x6ffffdf8: return "DT_SUNW_CHECKSUM"; case DT_PLTPADSZ: return "DT_PLTPADSZ"; case DT_MOVEENT: return "DT_MOVEENT"; case DT_MOVESZ: return "DT_MOVESZ"; case DT_FEATURE: return "DT_FEATURE"; case DT_POSFLAG_1: return "DT_POSFLAG_1"; case DT_SYMINSZ: return "DT_SYMINSZ"; case DT_SYMINENT : return "DT_SYMINENT (DT_VALRNGHI)"; case DT_ADDRRNGLO: return "DT_ADDRRNGLO"; case DT_GNU_HASH: return "DT_GNU_HASH"; case 0x6ffffef8: return "DT_GNU_CONFLICT"; case 0x6ffffef9: return "DT_GNU_LIBLIST"; case DT_CONFIG: return "DT_CONFIG"; case DT_DEPAUDIT: return "DT_DEPAUDIT"; case DT_AUDIT: return "DT_AUDIT"; case DT_PLTPAD: return "DT_PLTPAD"; case DT_MOVETAB: return "DT_MOVETAB"; case DT_SYMINFO : return "DT_SYMINFO (DT_ADDRRNGHI)"; case DT_RELACOUNT: return "DT_RELACOUNT"; case DT_RELCOUNT: return "DT_RELCOUNT"; case DT_FLAGS_1: return "DT_FLAGS_1"; case DT_VERDEF: return "DT_VERDEF"; case DT_VERDEFNUM: return "DT_VERDEFNUM"; case DT_VERNEED: return "DT_VERNEED"; case DT_VERNEEDNUM: return "DT_VERNEEDNUM"; case 0x6ffffff0: return "DT_GNU_VERSYM"; /* 0x70000000 - 0x7fffffff processor-specific semantics */ case 0x70000000: return "DT_IA_64_PLT_RESERVE"; case 0x7ffffffd: return "DT_SUNW_AUXILIARY"; case 0x7ffffffe: return "DT_SUNW_USED"; case 0x7fffffff: return "DT_SUNW_FILTER"; } snprintf(unknown_tag, sizeof(unknown_tag), "ERROR: TAG NOT DEFINED -- tag 0x%jx", (uintmax_t)tag); return (unknown_tag); } static const char * e_machines(u_int mach) { static char machdesc[64]; switch (mach) { case EM_NONE: return "EM_NONE"; case EM_M32: return "EM_M32"; case EM_SPARC: return "EM_SPARC"; case EM_386: return "EM_386"; case EM_68K: return "EM_68K"; case EM_88K: return "EM_88K"; case EM_IAMCU: return "EM_IAMCU"; case EM_860: return "EM_860"; case EM_MIPS: return "EM_MIPS"; case EM_PPC: return "EM_PPC"; case EM_PPC64: return "EM_PPC64"; case EM_ARM: return "EM_ARM"; case EM_ALPHA: return "EM_ALPHA (legacy)"; case EM_SPARCV9:return "EM_SPARCV9"; case EM_IA_64: return "EM_IA_64"; case EM_X86_64: return "EM_X86_64"; case EM_AARCH64:return "EM_AARCH64"; case EM_RISCV: return "EM_RISCV"; } snprintf(machdesc, sizeof(machdesc), "(unknown machine) -- type 0x%x", mach); return (machdesc); } static const char *e_types[] = { "ET_NONE", "ET_REL", "ET_EXEC", "ET_DYN", "ET_CORE" }; static const char *ei_versions[] = { "EV_NONE", "EV_CURRENT" }; static const char *ei_classes[] = { "ELFCLASSNONE", "ELFCLASS32", "ELFCLASS64" }; static const char *ei_data[] = { "ELFDATANONE", "ELFDATA2LSB", "ELFDATA2MSB" }; static const char *ei_abis[256] = { "ELFOSABI_NONE", "ELFOSABI_HPUX", "ELFOSABI_NETBSD", "ELFOSABI_LINUX", "ELFOSABI_HURD", "ELFOSABI_86OPEN", "ELFOSABI_SOLARIS", "ELFOSABI_AIX", "ELFOSABI_IRIX", "ELFOSABI_FREEBSD", "ELFOSABI_TRU64", "ELFOSABI_MODESTO", "ELFOSABI_OPENBSD", [255] = "ELFOSABI_STANDALONE" }; static const char *p_types[] = { "PT_NULL", "PT_LOAD", "PT_DYNAMIC", "PT_INTERP", "PT_NOTE", "PT_SHLIB", "PT_PHDR", "PT_TLS" }; static const char *p_flags[] = { "", "PF_X", "PF_W", "PF_X|PF_W", "PF_R", "PF_X|PF_R", "PF_W|PF_R", "PF_X|PF_W|PF_R" }; /* http://www.sco.com/developers/gabi/latest/ch4.sheader.html#sh_type */ static const char * sh_types(uint64_t machine, uint64_t sht) { static char unknown_buf[64]; if (sht < 0x60000000) { switch (sht) { case SHT_NULL: return "SHT_NULL"; case SHT_PROGBITS: return "SHT_PROGBITS"; case SHT_SYMTAB: return "SHT_SYMTAB"; case SHT_STRTAB: return "SHT_STRTAB"; case SHT_RELA: return "SHT_RELA"; case SHT_HASH: return "SHT_HASH"; case SHT_DYNAMIC: return "SHT_DYNAMIC"; case SHT_NOTE: return "SHT_NOTE"; case SHT_NOBITS: return "SHT_NOBITS"; case SHT_REL: return "SHT_REL"; case SHT_SHLIB: return "SHT_SHLIB"; case SHT_DYNSYM: return "SHT_DYNSYM"; case SHT_INIT_ARRAY: return "SHT_INIT_ARRAY"; case SHT_FINI_ARRAY: return "SHT_FINI_ARRAY"; case SHT_PREINIT_ARRAY: return "SHT_PREINIT_ARRAY"; case SHT_GROUP: return "SHT_GROUP"; case SHT_SYMTAB_SHNDX: return "SHT_SYMTAB_SHNDX"; } snprintf(unknown_buf, sizeof(unknown_buf), "ERROR: SHT %ju NOT DEFINED", (uintmax_t)sht); return (unknown_buf); } else if (sht < 0x70000000) { /* 0x60000000-0x6fffffff operating system-specific semantics */ switch (sht) { case 0x6ffffff0: return "XXX:VERSYM"; case SHT_SUNW_dof: return "SHT_SUNW_dof"; case SHT_GNU_HASH: return "SHT_GNU_HASH"; case 0x6ffffff7: return "SHT_GNU_LIBLIST"; case 0x6ffffffc: return "XXX:VERDEF"; case SHT_SUNW_verdef: return "SHT_SUNW(GNU)_verdef"; case SHT_SUNW_verneed: return "SHT_SUNW(GNU)_verneed"; case SHT_SUNW_versym: return "SHT_SUNW(GNU)_versym"; } snprintf(unknown_buf, sizeof(unknown_buf), "ERROR: OS-SPECIFIC SHT 0x%jx NOT DEFINED", (uintmax_t)sht); return (unknown_buf); } else if (sht < 0x80000000) { /* 0x70000000-0x7fffffff processor-specific semantics */ switch (machine) { case EM_ARM: switch (sht) { case SHT_ARM_EXIDX: return "SHT_ARM_EXIDX"; case SHT_ARM_PREEMPTMAP:return "SHT_ARM_PREEMPTMAP"; case SHT_ARM_ATTRIBUTES:return "SHT_ARM_ATTRIBUTES"; case SHT_ARM_DEBUGOVERLAY: return "SHT_ARM_DEBUGOVERLAY"; case SHT_ARM_OVERLAYSECTION: return "SHT_ARM_OVERLAYSECTION"; } break; case EM_IA_64: switch (sht) { case 0x70000000: return "SHT_IA_64_EXT"; case 0x70000001: return "SHT_IA_64_UNWIND"; } break; case EM_MIPS: switch (sht) { case SHT_MIPS_REGINFO: return "SHT_MIPS_REGINFO"; case SHT_MIPS_OPTIONS: return "SHT_MIPS_OPTIONS"; case SHT_MIPS_ABIFLAGS: return "SHT_MIPS_ABIFLAGS"; } break; } switch (sht) { case 0x7ffffffd: return "XXX:AUXILIARY"; case 0x7fffffff: return "XXX:FILTER"; } snprintf(unknown_buf, sizeof(unknown_buf), "ERROR: PROCESSOR-SPECIFIC SHT 0x%jx NOT DEFINED", (uintmax_t)sht); return (unknown_buf); } else { /* 0x80000000-0xffffffff application programs */ snprintf(unknown_buf, sizeof(unknown_buf), "ERROR: SHT 0x%jx NOT DEFINED", (uintmax_t)sht); return (unknown_buf); } } static const char *sh_flags[] = { "", "SHF_WRITE", "SHF_ALLOC", "SHF_WRITE|SHF_ALLOC", "SHF_EXECINSTR", "SHF_WRITE|SHF_EXECINSTR", "SHF_ALLOC|SHF_EXECINSTR", "SHF_WRITE|SHF_ALLOC|SHF_EXECINSTR" }; static const char * st_type(unsigned int mach, unsigned int type) { static char s_type[32]; switch (type) { case STT_NOTYPE: return "STT_NOTYPE"; case STT_OBJECT: return "STT_OBJECT"; case STT_FUNC: return "STT_FUNC"; case STT_SECTION: return "STT_SECTION"; case STT_FILE: return "STT_FILE"; case STT_COMMON: return "STT_COMMON"; case STT_TLS: return "STT_TLS"; case 13: if (mach == EM_SPARCV9) return "STT_SPARC_REGISTER"; break; } snprintf(s_type, sizeof(s_type), "", type); return (s_type); } static const char *st_bindings[] = { "STB_LOCAL", "STB_GLOBAL", "STB_WEAK" }; static char *dynstr; static char *shstrtab; static char *strtab; static FILE *out; static u_int64_t elf_get_byte(Elf32_Ehdr *e, void *base, elf_member_t member); static u_int64_t elf_get_quarter(Elf32_Ehdr *e, void *base, elf_member_t member); #if 0 static u_int64_t elf_get_half(Elf32_Ehdr *e, void *base, elf_member_t member); #endif static u_int64_t elf_get_word(Elf32_Ehdr *e, void *base, elf_member_t member); static u_int64_t elf_get_quad(Elf32_Ehdr *e, void *base, elf_member_t member); static void elf_print_ehdr(Elf32_Ehdr *e, void *sh); static void elf_print_phdr(Elf32_Ehdr *e, void *p); static void elf_print_shdr(Elf32_Ehdr *e, void *sh); static void elf_print_symtab(Elf32_Ehdr *e, void *sh, char *str); static void elf_print_dynamic(Elf32_Ehdr *e, void *sh); static void elf_print_rel(Elf32_Ehdr *e, void *r); static void elf_print_rela(Elf32_Ehdr *e, void *ra); static void elf_print_interp(Elf32_Ehdr *e, void *p); static void elf_print_got(Elf32_Ehdr *e, void *sh); static void elf_print_hash(Elf32_Ehdr *e, void *sh); static void elf_print_note(Elf32_Ehdr *e, void *sh); static void usage(void); /* * Helpers for ELF files with shnum or shstrndx values that don't fit in the * ELF header. If the values are too large then an escape value is used to * indicate that the actual value is found in one of section 0's fields. */ static uint64_t elf_get_shnum(Elf32_Ehdr *e, void *sh) { uint64_t shnum; shnum = elf_get_quarter(e, e, E_SHNUM); if (shnum == 0) shnum = elf_get_word(e, (char *)sh, SH_SIZE); return shnum; } static uint64_t elf_get_shstrndx(Elf32_Ehdr *e, void *sh) { uint64_t shstrndx; shstrndx = elf_get_quarter(e, e, E_SHSTRNDX); if (shstrndx == SHN_XINDEX) shstrndx = elf_get_word(e, (char *)sh, SH_LINK); return shstrndx; } int main(int ac, char **av) { cap_rights_t rights; u_int64_t phoff; u_int64_t shoff; u_int64_t phentsize; u_int64_t phnum; u_int64_t shentsize; u_int64_t shnum; u_int64_t shstrndx; u_int64_t offset; u_int64_t name; u_int64_t type; struct stat sb; - unsigned long cmd; u_int flags; Elf32_Ehdr *e; void *p; void *sh; void *v; int fd; int ch; int i; out = stdout; flags = 0; while ((ch = getopt(ac, av, "acdeiGhnprsw:")) != -1) switch (ch) { case 'a': flags = ED_ALL; break; case 'c': flags |= ED_SHDR; break; case 'd': flags |= ED_DYN; break; case 'e': flags |= ED_EHDR; break; case 'i': flags |= ED_INTERP; break; case 'G': flags |= ED_GOT; break; case 'h': flags |= ED_HASH; break; case 'n': flags |= ED_NOTE; break; case 'p': flags |= ED_PHDR; break; case 'r': flags |= ED_REL; break; case 's': flags |= ED_SYMTAB; break; case 'w': if ((out = fopen(optarg, "w")) == NULL) err(1, "%s", optarg); cap_rights_init(&rights, CAP_FSTAT, CAP_WRITE); if (cap_rights_limit(fileno(out), &rights) < 0 && errno != ENOSYS) err(1, "unable to limit rights for %s", optarg); break; case '?': default: usage(); } ac -= optind; av += optind; if (ac == 0 || flags == 0) usage(); if ((fd = open(*av, O_RDONLY)) < 0 || fstat(fd, &sb) < 0) err(1, "%s", *av); cap_rights_init(&rights, CAP_MMAP_R); if (cap_rights_limit(fd, &rights) < 0 && errno != ENOSYS) err(1, "unable to limit rights for %s", *av); - cap_rights_limit(STDIN_FILENO, cap_rights_init(&rights)); - cap_rights_init(&rights, CAP_FSTAT, CAP_IOCTL, CAP_WRITE); - cmd = TIOCGETA; /* required by isatty(3) in printf(3) */ - if ((cap_rights_limit(STDOUT_FILENO, &rights) < 0 && errno != ENOSYS) || - (cap_ioctls_limit(STDOUT_FILENO, &cmd, 1) < 0 && errno != ENOSYS) || - (cap_rights_limit(STDERR_FILENO, &rights) < 0 && errno != ENOSYS) || - (cap_ioctls_limit(STDERR_FILENO, &cmd, 1) < 0 && errno != ENOSYS)) - err(1, "unable to limit rights for stdout/stderr"); + cap_rights_init(&rights); + if ((cap_rights_limit(STDIN_FILENO, &rights) < 0 && errno != ENOSYS) || + caph_limit_stdout() < 0 || caph_limit_stderr() < 0) { + err(1, "unable to limit rights for stdio"); + } if (cap_enter() < 0 && errno != ENOSYS) err(1, "unable to enter capability mode"); e = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); if (e == MAP_FAILED) err(1, NULL); if (!IS_ELF(*(Elf32_Ehdr *)e)) errx(1, "not an elf file"); phoff = elf_get_off(e, e, E_PHOFF); shoff = elf_get_off(e, e, E_SHOFF); phentsize = elf_get_quarter(e, e, E_PHENTSIZE); phnum = elf_get_quarter(e, e, E_PHNUM); shentsize = elf_get_quarter(e, e, E_SHENTSIZE); p = (char *)e + phoff; if (shoff > 0) { sh = (char *)e + shoff; shnum = elf_get_shnum(e, sh); shstrndx = elf_get_shstrndx(e, sh); offset = elf_get_off(e, (char *)sh + shstrndx * shentsize, SH_OFFSET); shstrtab = (char *)e + offset; } else { sh = NULL; shnum = 0; shstrndx = 0; shstrtab = NULL; } for (i = 0; (u_int64_t)i < shnum; i++) { name = elf_get_word(e, (char *)sh + i * shentsize, SH_NAME); offset = elf_get_off(e, (char *)sh + i * shentsize, SH_OFFSET); if (strcmp(shstrtab + name, ".strtab") == 0) strtab = (char *)e + offset; if (strcmp(shstrtab + name, ".dynstr") == 0) dynstr = (char *)e + offset; } if (flags & ED_EHDR) elf_print_ehdr(e, sh); if (flags & ED_PHDR) elf_print_phdr(e, p); if (flags & ED_SHDR) elf_print_shdr(e, sh); for (i = 0; (u_int64_t)i < phnum; i++) { v = (char *)p + i * phentsize; type = elf_get_word(e, v, P_TYPE); switch (type) { case PT_INTERP: if (flags & ED_INTERP) elf_print_interp(e, v); break; case PT_NULL: case PT_LOAD: case PT_DYNAMIC: case PT_NOTE: case PT_SHLIB: case PT_PHDR: break; } } for (i = 0; (u_int64_t)i < shnum; i++) { v = (char *)sh + i * shentsize; type = elf_get_word(e, v, SH_TYPE); switch (type) { case SHT_SYMTAB: if (flags & ED_SYMTAB) elf_print_symtab(e, v, strtab); break; case SHT_DYNAMIC: if (flags & ED_DYN) elf_print_dynamic(e, v); break; case SHT_RELA: if (flags & ED_REL) elf_print_rela(e, v); break; case SHT_REL: if (flags & ED_REL) elf_print_rel(e, v); break; case SHT_NOTE: name = elf_get_word(e, v, SH_NAME); if (flags & ED_NOTE && strcmp(shstrtab + name, ".note.ABI-tag") == 0) elf_print_note(e, v); break; case SHT_DYNSYM: if (flags & ED_SYMTAB) elf_print_symtab(e, v, dynstr); break; case SHT_PROGBITS: name = elf_get_word(e, v, SH_NAME); if (flags & ED_GOT && strcmp(shstrtab + name, ".got") == 0) elf_print_got(e, v); break; case SHT_HASH: if (flags & ED_HASH) elf_print_hash(e, v); break; case SHT_NULL: case SHT_STRTAB: case SHT_NOBITS: case SHT_SHLIB: break; } } return 0; } static void elf_print_ehdr(Elf32_Ehdr *e, void *sh) { u_int64_t class; u_int64_t data; u_int64_t osabi; u_int64_t type; u_int64_t machine; u_int64_t version; u_int64_t entry; u_int64_t phoff; u_int64_t shoff; u_int64_t flags; u_int64_t ehsize; u_int64_t phentsize; u_int64_t phnum; u_int64_t shentsize; u_int64_t shnum; u_int64_t shstrndx; class = elf_get_byte(e, e, E_CLASS); data = elf_get_byte(e, e, E_DATA); osabi = elf_get_byte(e, e, E_OSABI); type = elf_get_quarter(e, e, E_TYPE); machine = elf_get_quarter(e, e, E_MACHINE); version = elf_get_word(e, e, E_VERSION); entry = elf_get_addr(e, e, E_ENTRY); phoff = elf_get_off(e, e, E_PHOFF); shoff = elf_get_off(e, e, E_SHOFF); flags = elf_get_word(e, e, E_FLAGS); ehsize = elf_get_quarter(e, e, E_EHSIZE); phentsize = elf_get_quarter(e, e, E_PHENTSIZE); phnum = elf_get_quarter(e, e, E_PHNUM); shentsize = elf_get_quarter(e, e, E_SHENTSIZE); fprintf(out, "\nelf header:\n"); fprintf(out, "\n"); fprintf(out, "\te_ident: %s %s %s\n", ei_classes[class], ei_data[data], ei_abis[osabi]); fprintf(out, "\te_type: %s\n", e_types[type]); fprintf(out, "\te_machine: %s\n", e_machines(machine)); fprintf(out, "\te_version: %s\n", ei_versions[version]); fprintf(out, "\te_entry: %#jx\n", (intmax_t)entry); fprintf(out, "\te_phoff: %jd\n", (intmax_t)phoff); fprintf(out, "\te_shoff: %jd\n", (intmax_t)shoff); fprintf(out, "\te_flags: %jd\n", (intmax_t)flags); fprintf(out, "\te_ehsize: %jd\n", (intmax_t)ehsize); fprintf(out, "\te_phentsize: %jd\n", (intmax_t)phentsize); fprintf(out, "\te_phnum: %jd\n", (intmax_t)phnum); fprintf(out, "\te_shentsize: %jd\n", (intmax_t)shentsize); if (sh != NULL) { shnum = elf_get_shnum(e, sh); shstrndx = elf_get_shstrndx(e, sh); fprintf(out, "\te_shnum: %jd\n", (intmax_t)shnum); fprintf(out, "\te_shstrndx: %jd\n", (intmax_t)shstrndx); } } static void elf_print_phdr(Elf32_Ehdr *e, void *p) { u_int64_t phentsize; u_int64_t phnum; u_int64_t type; u_int64_t offset; u_int64_t vaddr; u_int64_t paddr; u_int64_t filesz; u_int64_t memsz; u_int64_t flags; u_int64_t align; void *v; int i; phentsize = elf_get_quarter(e, e, E_PHENTSIZE); phnum = elf_get_quarter(e, e, E_PHNUM); fprintf(out, "\nprogram header:\n"); for (i = 0; (u_int64_t)i < phnum; i++) { v = (char *)p + i * phentsize; type = elf_get_word(e, v, P_TYPE); offset = elf_get_off(e, v, P_OFFSET); vaddr = elf_get_addr(e, v, P_VADDR); paddr = elf_get_addr(e, v, P_PADDR); filesz = elf_get_size(e, v, P_FILESZ); memsz = elf_get_size(e, v, P_MEMSZ); flags = elf_get_word(e, v, P_FLAGS); align = elf_get_size(e, v, P_ALIGN); fprintf(out, "\n"); fprintf(out, "entry: %d\n", i); fprintf(out, "\tp_type: %s\n", p_types[type & 0x7]); fprintf(out, "\tp_offset: %jd\n", (intmax_t)offset); fprintf(out, "\tp_vaddr: %#jx\n", (intmax_t)vaddr); fprintf(out, "\tp_paddr: %#jx\n", (intmax_t)paddr); fprintf(out, "\tp_filesz: %jd\n", (intmax_t)filesz); fprintf(out, "\tp_memsz: %jd\n", (intmax_t)memsz); fprintf(out, "\tp_flags: %s\n", p_flags[flags]); fprintf(out, "\tp_align: %jd\n", (intmax_t)align); } } static void elf_print_shdr(Elf32_Ehdr *e, void *sh) { u_int64_t shentsize; u_int64_t shnum; u_int64_t name; u_int64_t type; u_int64_t flags; u_int64_t addr; u_int64_t offset; u_int64_t size; u_int64_t shlink; u_int64_t info; u_int64_t addralign; u_int64_t entsize; u_int64_t machine; void *v; int i; if (sh == NULL) { fprintf(out, "\nNo section headers\n"); return; } machine = elf_get_quarter(e, e, E_MACHINE); shentsize = elf_get_quarter(e, e, E_SHENTSIZE); shnum = elf_get_shnum(e, sh); fprintf(out, "\nsection header:\n"); for (i = 0; (u_int64_t)i < shnum; i++) { v = (char *)sh + i * shentsize; name = elf_get_word(e, v, SH_NAME); type = elf_get_word(e, v, SH_TYPE); flags = elf_get_word(e, v, SH_FLAGS); addr = elf_get_addr(e, v, SH_ADDR); offset = elf_get_off(e, v, SH_OFFSET); size = elf_get_size(e, v, SH_SIZE); shlink = elf_get_word(e, v, SH_LINK); info = elf_get_word(e, v, SH_INFO); addralign = elf_get_size(e, v, SH_ADDRALIGN); entsize = elf_get_size(e, v, SH_ENTSIZE); fprintf(out, "\n"); fprintf(out, "entry: %d\n", i); fprintf(out, "\tsh_name: %s\n", shstrtab + name); fprintf(out, "\tsh_type: %s\n", sh_types(machine, type)); fprintf(out, "\tsh_flags: %s\n", sh_flags[flags & 0x7]); fprintf(out, "\tsh_addr: %#jx\n", addr); fprintf(out, "\tsh_offset: %jd\n", (intmax_t)offset); fprintf(out, "\tsh_size: %jd\n", (intmax_t)size); fprintf(out, "\tsh_link: %jd\n", (intmax_t)shlink); fprintf(out, "\tsh_info: %jd\n", (intmax_t)info); fprintf(out, "\tsh_addralign: %jd\n", (intmax_t)addralign); fprintf(out, "\tsh_entsize: %jd\n", (intmax_t)entsize); } } static void elf_print_symtab(Elf32_Ehdr *e, void *sh, char *str) { u_int64_t machine; u_int64_t offset; u_int64_t entsize; u_int64_t size; u_int64_t name; u_int64_t value; u_int64_t info; u_int64_t shndx; void *st; int len; int i; machine = elf_get_quarter(e, e, E_MACHINE); offset = elf_get_off(e, sh, SH_OFFSET); entsize = elf_get_size(e, sh, SH_ENTSIZE); size = elf_get_size(e, sh, SH_SIZE); name = elf_get_word(e, sh, SH_NAME); len = size / entsize; fprintf(out, "\nsymbol table (%s):\n", shstrtab + name); for (i = 0; i < len; i++) { st = (char *)e + offset + i * entsize; name = elf_get_word(e, st, ST_NAME); value = elf_get_addr(e, st, ST_VALUE); size = elf_get_size(e, st, ST_SIZE); info = elf_get_byte(e, st, ST_INFO); shndx = elf_get_quarter(e, st, ST_SHNDX); fprintf(out, "\n"); fprintf(out, "entry: %d\n", i); fprintf(out, "\tst_name: %s\n", str + name); fprintf(out, "\tst_value: %#jx\n", value); fprintf(out, "\tst_size: %jd\n", (intmax_t)size); fprintf(out, "\tst_info: %s %s\n", st_type(machine, ELF32_ST_TYPE(info)), st_bindings[ELF32_ST_BIND(info)]); fprintf(out, "\tst_shndx: %jd\n", (intmax_t)shndx); } } static void elf_print_dynamic(Elf32_Ehdr *e, void *sh) { u_int64_t offset; u_int64_t entsize; u_int64_t size; int64_t tag; u_int64_t ptr; u_int64_t val; void *d; int i; offset = elf_get_off(e, sh, SH_OFFSET); entsize = elf_get_size(e, sh, SH_ENTSIZE); size = elf_get_size(e, sh, SH_SIZE); fprintf(out, "\ndynamic:\n"); for (i = 0; (u_int64_t)i < size / entsize; i++) { d = (char *)e + offset + i * entsize; tag = elf_get_size(e, d, D_TAG); ptr = elf_get_size(e, d, D_PTR); val = elf_get_addr(e, d, D_VAL); fprintf(out, "\n"); fprintf(out, "entry: %d\n", i); fprintf(out, "\td_tag: %s\n", d_tags(tag)); switch (tag) { case DT_NEEDED: case DT_SONAME: case DT_RPATH: fprintf(out, "\td_val: %s\n", dynstr + val); break; case DT_PLTRELSZ: case DT_RELA: case DT_RELASZ: case DT_RELAENT: case DT_STRSZ: case DT_SYMENT: case DT_RELSZ: case DT_RELENT: case DT_PLTREL: fprintf(out, "\td_val: %jd\n", (intmax_t)val); break; case DT_PLTGOT: case DT_HASH: case DT_STRTAB: case DT_SYMTAB: case DT_INIT: case DT_FINI: case DT_REL: case DT_JMPREL: fprintf(out, "\td_ptr: %#jx\n", ptr); break; case DT_NULL: case DT_SYMBOLIC: case DT_DEBUG: case DT_TEXTREL: break; } } } static void elf_print_rela(Elf32_Ehdr *e, void *sh) { u_int64_t offset; u_int64_t entsize; u_int64_t size; u_int64_t name; u_int64_t info; int64_t addend; void *ra; void *v; int i; offset = elf_get_off(e, sh, SH_OFFSET); entsize = elf_get_size(e, sh, SH_ENTSIZE); size = elf_get_size(e, sh, SH_SIZE); name = elf_get_word(e, sh, SH_NAME); v = (char *)e + offset; fprintf(out, "\nrelocation with addend (%s):\n", shstrtab + name); for (i = 0; (u_int64_t)i < size / entsize; i++) { ra = (char *)v + i * entsize; offset = elf_get_addr(e, ra, RA_OFFSET); info = elf_get_word(e, ra, RA_INFO); addend = elf_get_off(e, ra, RA_ADDEND); fprintf(out, "\n"); fprintf(out, "entry: %d\n", i); fprintf(out, "\tr_offset: %#jx\n", offset); fprintf(out, "\tr_info: %jd\n", (intmax_t)info); fprintf(out, "\tr_addend: %jd\n", (intmax_t)addend); } } static void elf_print_rel(Elf32_Ehdr *e, void *sh) { u_int64_t offset; u_int64_t entsize; u_int64_t size; u_int64_t name; u_int64_t info; void *r; void *v; int i; offset = elf_get_off(e, sh, SH_OFFSET); entsize = elf_get_size(e, sh, SH_ENTSIZE); size = elf_get_size(e, sh, SH_SIZE); name = elf_get_word(e, sh, SH_NAME); v = (char *)e + offset; fprintf(out, "\nrelocation (%s):\n", shstrtab + name); for (i = 0; (u_int64_t)i < size / entsize; i++) { r = (char *)v + i * entsize; offset = elf_get_addr(e, r, R_OFFSET); info = elf_get_word(e, r, R_INFO); fprintf(out, "\n"); fprintf(out, "entry: %d\n", i); fprintf(out, "\tr_offset: %#jx\n", offset); fprintf(out, "\tr_info: %jd\n", (intmax_t)info); } } static void elf_print_interp(Elf32_Ehdr *e, void *p) { u_int64_t offset; char *s; offset = elf_get_off(e, p, P_OFFSET); s = (char *)e + offset; fprintf(out, "\ninterp:\n"); fprintf(out, "\t%s\n", s); } static void elf_print_got(Elf32_Ehdr *e, void *sh) { u_int64_t offset; u_int64_t addralign; u_int64_t size; u_int64_t addr; void *v; int i; offset = elf_get_off(e, sh, SH_OFFSET); addralign = elf_get_size(e, sh, SH_ADDRALIGN); size = elf_get_size(e, sh, SH_SIZE); v = (char *)e + offset; fprintf(out, "\nglobal offset table:\n"); for (i = 0; (u_int64_t)i < size / addralign; i++) { addr = elf_get_addr(e, (char *)v + i * addralign, 0); fprintf(out, "\n"); fprintf(out, "entry: %d\n", i); fprintf(out, "\t%#jx\n", addr); } } static void elf_print_hash(Elf32_Ehdr *e __unused, void *sh __unused) { } static void elf_print_note(Elf32_Ehdr *e, void *sh) { u_int64_t offset; u_int64_t size; u_int64_t name; u_int32_t namesz; u_int32_t descsz; u_int32_t desc; char *n, *s; offset = elf_get_off(e, sh, SH_OFFSET); size = elf_get_size(e, sh, SH_SIZE); name = elf_get_word(e, sh, SH_NAME); n = (char *)e + offset; fprintf(out, "\nnote (%s):\n", shstrtab + name); while (n < ((char *)e + offset + size)) { namesz = elf_get_word(e, n, N_NAMESZ); descsz = elf_get_word(e, n, N_DESCSZ); s = n + sizeof(Elf_Note); desc = elf_get_word(e, n + sizeof(Elf_Note) + namesz, 0); fprintf(out, "\t%s %d\n", s, desc); n += sizeof(Elf_Note) + namesz + descsz; } } static u_int64_t elf_get_byte(Elf32_Ehdr *e, void *base, elf_member_t member) { u_int64_t val; val = 0; switch (e->e_ident[EI_CLASS]) { case ELFCLASS32: val = ((uint8_t *)base)[elf32_offsets[member]]; break; case ELFCLASS64: val = ((uint8_t *)base)[elf64_offsets[member]]; break; case ELFCLASSNONE: errx(1, "invalid class"); } return val; } static u_int64_t elf_get_quarter(Elf32_Ehdr *e, void *base, elf_member_t member) { u_int64_t val; val = 0; switch (e->e_ident[EI_CLASS]) { case ELFCLASS32: base = (char *)base + elf32_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be16dec(base); break; case ELFDATA2LSB: val = le16dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASS64: base = (char *)base + elf64_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be16dec(base); break; case ELFDATA2LSB: val = le16dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASSNONE: errx(1, "invalid class"); } return val; } #if 0 static u_int64_t elf_get_half(Elf32_Ehdr *e, void *base, elf_member_t member) { u_int64_t val; val = 0; switch (e->e_ident[EI_CLASS]) { case ELFCLASS32: base = (char *)base + elf32_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be16dec(base); break; case ELFDATA2LSB: val = le16dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASS64: base = (char *)base + elf64_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be32dec(base); break; case ELFDATA2LSB: val = le32dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASSNONE: errx(1, "invalid class"); } return val; } #endif static u_int64_t elf_get_word(Elf32_Ehdr *e, void *base, elf_member_t member) { u_int64_t val; val = 0; switch (e->e_ident[EI_CLASS]) { case ELFCLASS32: base = (char *)base + elf32_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be32dec(base); break; case ELFDATA2LSB: val = le32dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASS64: base = (char *)base + elf64_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be32dec(base); break; case ELFDATA2LSB: val = le32dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASSNONE: errx(1, "invalid class"); } return val; } static u_int64_t elf_get_quad(Elf32_Ehdr *e, void *base, elf_member_t member) { u_int64_t val; val = 0; switch (e->e_ident[EI_CLASS]) { case ELFCLASS32: base = (char *)base + elf32_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be32dec(base); break; case ELFDATA2LSB: val = le32dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASS64: base = (char *)base + elf64_offsets[member]; switch (e->e_ident[EI_DATA]) { case ELFDATA2MSB: val = be64dec(base); break; case ELFDATA2LSB: val = le64dec(base); break; case ELFDATANONE: errx(1, "invalid data format"); } break; case ELFCLASSNONE: errx(1, "invalid class"); } return val; } static void usage(void) { fprintf(stderr, "usage: elfdump -a | -cdeGhinprs [-w file] file\n"); exit(1); } Index: user/alc/PQ_LAUNDRY/usr.bin/kdump/kdump.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/kdump/kdump.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/kdump/kdump.c (revision 306832) @@ -1,1894 +1,1858 @@ /*- * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1988, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)kdump.c 8.1 (Berkeley) 6/6/93"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #define _WANT_KERNEL_ERRNO #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LIBCASPER #include #endif #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include "ktrace.h" #include "kdump_subr.h" #ifdef HAVE_LIBCASPER #include #include #include #endif u_int abidump(struct ktr_header *); int fetchprocinfo(struct ktr_header *, u_int *); int fread_tail(void *, int, int); void dumpheader(struct ktr_header *); void ktrsyscall(struct ktr_syscall *, u_int); void ktrsysret(struct ktr_sysret *, u_int); void ktrnamei(char *, int); void hexdump(char *, int, int); void visdump(char *, int, int); void ktrgenio(struct ktr_genio *, int); void ktrpsig(struct ktr_psig *); void ktrcsw(struct ktr_csw *); void ktrcsw_old(struct ktr_csw_old *); void ktruser(int, void *); void ktrcaprights(cap_rights_t *); void ktritimerval(struct itimerval *it); void ktrsockaddr(struct sockaddr *); void ktrstat(struct stat *); void ktrstruct(char *, size_t); void ktrcapfail(struct ktr_cap_fail *); void ktrfault(struct ktr_fault *); void ktrfaultend(struct ktr_faultend *); -void limitfd(int fd); void usage(void); #define TIMESTAMP_NONE 0x0 #define TIMESTAMP_ABSOLUTE 0x1 #define TIMESTAMP_ELAPSED 0x2 #define TIMESTAMP_RELATIVE 0x4 extern const char *signames[]; static int timestamp, decimal, fancy = 1, suppressdata, tail, threads, maxdata, resolv = 0, abiflag = 0, syscallno = 0; static const char *tracefile = DEF_TRACEFILE; static struct ktr_header ktr_header; #define TIME_FORMAT "%b %e %T %Y" #define eqs(s1, s2) (strcmp((s1), (s2)) == 0) #define print_number64(first,i,n,c) do { \ uint64_t __v; \ \ if (quad_align && (((ptrdiff_t)((i) - (first))) & 1) == 1) { \ (i)++; \ (n)--; \ } \ if (quad_slots == 2) \ __v = (uint64_t)(uint32_t)(i)[0] | \ ((uint64_t)(uint32_t)(i)[1]) << 32; \ else \ __v = (uint64_t)*(i); \ if (decimal) \ printf("%c%jd", (c), (intmax_t)__v); \ else \ printf("%c%#jx", (c), (uintmax_t)__v); \ (i) += quad_slots; \ (n) -= quad_slots; \ (c) = ','; \ } while (0) #define print_number(i,n,c) do { \ if (decimal) \ printf("%c%jd", c, (intmax_t)*i); \ else \ printf("%c%#jx", c, (uintmax_t)(u_register_t)*i); \ i++; \ n--; \ c = ','; \ } while (0) struct proc_info { TAILQ_ENTRY(proc_info) info; u_int sv_flags; pid_t pid; }; static TAILQ_HEAD(trace_procs, proc_info) trace_procs; #ifdef HAVE_LIBCASPER static cap_channel_t *cappwd, *capgrp; #endif static void strerror_init(void) { /* * Cache NLS data before entering capability mode. * XXXPJD: There should be strerror_init() and strsignal_init() in libc. */ (void)catopen("libc", NL_CAT_LOCALE); } static void localtime_init(void) { time_t ltime; /* * Allow localtime(3) to cache /etc/localtime content before entering * capability mode. * XXXPJD: There should be localtime_init() in libc. */ (void)time(<ime); (void)localtime(<ime); } #ifdef HAVE_LIBCASPER static int cappwdgrp_setup(cap_channel_t **cappwdp, cap_channel_t **capgrpp) { cap_channel_t *capcas, *cappwdloc, *capgrploc; const char *cmds[1], *fields[1]; capcas = cap_init(); if (capcas == NULL) { err(1, "unable to create casper process"); exit(1); } cappwdloc = cap_service_open(capcas, "system.pwd"); capgrploc = cap_service_open(capcas, "system.grp"); /* Casper capability no longer needed. */ cap_close(capcas); if (cappwdloc == NULL || capgrploc == NULL) { if (cappwdloc == NULL) warn("unable to open system.pwd service"); if (capgrploc == NULL) warn("unable to open system.grp service"); exit(1); } /* Limit system.pwd to only getpwuid() function and pw_name field. */ cmds[0] = "getpwuid"; if (cap_pwd_limit_cmds(cappwdloc, cmds, 1) < 0) err(1, "unable to limit system.pwd service"); fields[0] = "pw_name"; if (cap_pwd_limit_fields(cappwdloc, fields, 1) < 0) err(1, "unable to limit system.pwd service"); /* Limit system.grp to only getgrgid() function and gr_name field. */ cmds[0] = "getgrgid"; if (cap_grp_limit_cmds(capgrploc, cmds, 1) < 0) err(1, "unable to limit system.grp service"); fields[0] = "gr_name"; if (cap_grp_limit_fields(capgrploc, fields, 1) < 0) err(1, "unable to limit system.grp service"); *cappwdp = cappwdloc; *capgrpp = capgrploc; return (0); } #endif /* HAVE_LIBCASPER */ int main(int argc, char *argv[]) { int ch, ktrlen, size; void *m; int trpoints = ALL_POINTS; int drop_logged; pid_t pid = 0; u_int sv_flags; setlocale(LC_CTYPE, ""); timestamp = TIMESTAMP_NONE; while ((ch = getopt(argc,argv,"f:dElm:np:AHRrSsTt:")) != -1) switch (ch) { case 'A': abiflag = 1; break; case 'f': tracefile = optarg; break; case 'd': decimal = 1; break; case 'l': tail = 1; break; case 'm': maxdata = atoi(optarg); break; case 'n': fancy = 0; break; case 'p': pid = atoi(optarg); break; case 'r': resolv = 1; break; case 'S': syscallno = 1; break; case 's': suppressdata = 1; break; case 'E': timestamp |= TIMESTAMP_ELAPSED; break; case 'H': threads = 1; break; case 'R': timestamp |= TIMESTAMP_RELATIVE; break; case 'T': timestamp |= TIMESTAMP_ABSOLUTE; break; case 't': trpoints = getpoints(optarg); if (trpoints < 0) errx(1, "unknown trace point in %s", optarg); break; default: usage(); } if (argc > optind) usage(); m = malloc(size = 1025); if (m == NULL) errx(1, "%s", strerror(ENOMEM)); if (strcmp(tracefile, "-") != 0) if (!freopen(tracefile, "r", stdin)) err(1, "%s", tracefile); strerror_init(); localtime_init(); #ifdef HAVE_LIBCASPER if (resolv != 0) { if (cappwdgrp_setup(&cappwd, &capgrp) < 0) { cappwd = NULL; capgrp = NULL; } } if (resolv == 0 || (cappwd != NULL && capgrp != NULL)) { if (cap_enter() < 0 && errno != ENOSYS) err(1, "unable to enter capability mode"); } #else if (resolv == 0) { if (cap_enter() < 0 && errno != ENOSYS) err(1, "unable to enter capability mode"); } #endif - limitfd(STDIN_FILENO); - limitfd(STDOUT_FILENO); - limitfd(STDERR_FILENO); + if (caph_limit_stdio() == -1) + err(1, "unable to limit stdio"); TAILQ_INIT(&trace_procs); drop_logged = 0; while (fread_tail(&ktr_header, sizeof(struct ktr_header), 1)) { if (ktr_header.ktr_type & KTR_DROP) { ktr_header.ktr_type &= ~KTR_DROP; if (!drop_logged && threads) { printf( "%6jd %6jd %-8.*s Events dropped.\n", (intmax_t)ktr_header.ktr_pid, ktr_header.ktr_tid > 0 ? (intmax_t)ktr_header.ktr_tid : 0, MAXCOMLEN, ktr_header.ktr_comm); drop_logged = 1; } else if (!drop_logged) { printf("%6jd %-8.*s Events dropped.\n", (intmax_t)ktr_header.ktr_pid, MAXCOMLEN, ktr_header.ktr_comm); drop_logged = 1; } } if (trpoints & (1< size) { m = realloc(m, ktrlen+1); if (m == NULL) errx(1, "%s", strerror(ENOMEM)); size = ktrlen; } if (ktrlen && fread_tail(m, ktrlen, 1) == 0) errx(1, "data too short"); if (fetchprocinfo(&ktr_header, (u_int *)m) != 0) continue; sv_flags = abidump(&ktr_header); if (pid && ktr_header.ktr_pid != pid && ktr_header.ktr_tid != pid) continue; if ((trpoints & (1<ktr_type) { case KTR_PROCCTOR: TAILQ_FOREACH(pi, &trace_procs, info) { if (pi->pid == kth->ktr_pid) { TAILQ_REMOVE(&trace_procs, pi, info); break; } } pi = malloc(sizeof(struct proc_info)); if (pi == NULL) errx(1, "%s", strerror(ENOMEM)); pi->sv_flags = *flags; pi->pid = kth->ktr_pid; TAILQ_INSERT_TAIL(&trace_procs, pi, info); return (1); case KTR_PROCDTOR: TAILQ_FOREACH(pi, &trace_procs, info) { if (pi->pid == kth->ktr_pid) { TAILQ_REMOVE(&trace_procs, pi, info); free(pi); break; } } return (1); } return (0); } u_int abidump(struct ktr_header *kth) { struct proc_info *pi; const char *abi; const char *arch; u_int flags = 0; TAILQ_FOREACH(pi, &trace_procs, info) { if (pi->pid == kth->ktr_pid) { flags = pi->sv_flags; break; } } if (abiflag == 0) return (flags); switch (flags & SV_ABI_MASK) { case SV_ABI_LINUX: abi = "L"; break; case SV_ABI_FREEBSD: abi = "F"; break; case SV_ABI_CLOUDABI: abi = "C"; break; default: abi = "U"; break; } if (flags & SV_LP64) arch = "64"; else if (flags & SV_ILP32) arch = "32"; else arch = "00"; printf("%s%s ", abi, arch); return (flags); } void dumpheader(struct ktr_header *kth) { static char unknown[64]; static struct timeval prevtime, prevtime_e; struct timeval temp; const char *type; const char *sign; switch (kth->ktr_type) { case KTR_SYSCALL: type = "CALL"; break; case KTR_SYSRET: type = "RET "; break; case KTR_NAMEI: type = "NAMI"; break; case KTR_GENIO: type = "GIO "; break; case KTR_PSIG: type = "PSIG"; break; case KTR_CSW: type = "CSW "; break; case KTR_USER: type = "USER"; break; case KTR_STRUCT: type = "STRU"; break; case KTR_SYSCTL: type = "SCTL"; break; case KTR_PROCCTOR: /* FALLTHROUGH */ case KTR_PROCDTOR: return; case KTR_CAPFAIL: type = "CAP "; break; case KTR_FAULT: type = "PFLT"; break; case KTR_FAULTEND: type = "PRET"; break; default: sprintf(unknown, "UNKNOWN(%d)", kth->ktr_type); type = unknown; } /* * The ktr_tid field was previously the ktr_buffer field, which held * the kernel pointer value for the buffer associated with data * following the record header. It now holds a threadid, but only * for trace files after the change. Older trace files still contain * kernel pointers. Detect this and suppress the results by printing * negative tid's as 0. */ if (threads) printf("%6jd %6jd %-8.*s ", (intmax_t)kth->ktr_pid, kth->ktr_tid > 0 ? (intmax_t)kth->ktr_tid : 0, MAXCOMLEN, kth->ktr_comm); else printf("%6jd %-8.*s ", (intmax_t)kth->ktr_pid, MAXCOMLEN, kth->ktr_comm); if (timestamp) { if (timestamp & TIMESTAMP_ABSOLUTE) { printf("%jd.%06ld ", (intmax_t)kth->ktr_time.tv_sec, kth->ktr_time.tv_usec); } if (timestamp & TIMESTAMP_ELAPSED) { if (prevtime_e.tv_sec == 0) prevtime_e = kth->ktr_time; timersub(&kth->ktr_time, &prevtime_e, &temp); printf("%jd.%06ld ", (intmax_t)temp.tv_sec, temp.tv_usec); } if (timestamp & TIMESTAMP_RELATIVE) { if (prevtime.tv_sec == 0) prevtime = kth->ktr_time; if (timercmp(&kth->ktr_time, &prevtime, <)) { timersub(&prevtime, &kth->ktr_time, &temp); sign = "-"; } else { timersub(&kth->ktr_time, &prevtime, &temp); sign = ""; } prevtime = kth->ktr_time; printf("%s%jd.%06ld ", sign, (intmax_t)temp.tv_sec, temp.tv_usec); } } printf("%s ", type); } #include static void ioctlname(unsigned long val) { const char *str; str = sysdecode_ioctlname(val); if (str != NULL) printf("%s", str); else if (decimal) printf("%lu", val); else printf("%#lx", val); } static enum sysdecode_abi syscallabi(u_int sv_flags) { if (sv_flags == 0) return (SYSDECODE_ABI_FREEBSD); switch (sv_flags & SV_ABI_MASK) { case SV_ABI_FREEBSD: return (SYSDECODE_ABI_FREEBSD); #if defined(__amd64__) || defined(__i386__) case SV_ABI_LINUX: #ifdef __amd64__ if (sv_flags & SV_ILP32) return (SYSDECODE_ABI_LINUX32); #endif return (SYSDECODE_ABI_LINUX); #endif #if defined(__aarch64__) || defined(__amd64__) case SV_ABI_CLOUDABI: return (SYSDECODE_ABI_CLOUDABI64); #endif default: return (SYSDECODE_ABI_UNKNOWN); } } static void syscallname(u_int code, u_int sv_flags) { const char *name; name = sysdecode_syscallname(syscallabi(sv_flags), code); if (name == NULL) printf("[%d]", code); else { printf("%s", name); if (syscallno) printf("[%d]", code); } } void ktrsyscall(struct ktr_syscall *ktr, u_int sv_flags) { int narg = ktr->ktr_narg; register_t *ip, *first; intmax_t arg; int quad_align, quad_slots; syscallname(ktr->ktr_code, sv_flags); ip = first = &ktr->ktr_args[0]; if (narg) { char c = '('; if (fancy && (sv_flags == 0 || (sv_flags & SV_ABI_MASK) == SV_ABI_FREEBSD)) { quad_align = 0; if (sv_flags & SV_ILP32) { #ifdef __powerpc__ quad_align = 1; #endif quad_slots = 2; } else quad_slots = 1; switch (ktr->ktr_code) { case SYS_bindat: case SYS_connectat: case SYS_faccessat: case SYS_fchmodat: case SYS_fchownat: case SYS_fstatat: case SYS_futimesat: case SYS_linkat: case SYS_mkdirat: case SYS_mkfifoat: case SYS_mknodat: case SYS_openat: case SYS_readlinkat: case SYS_renameat: case SYS_unlinkat: case SYS_utimensat: putchar('('); atfdname(*ip, decimal); c = ','; ip++; narg--; break; } switch (ktr->ktr_code) { case SYS_ioctl: { print_number(ip, narg, c); putchar(c); ioctlname(*ip); c = ','; ip++; narg--; break; } case SYS_ptrace: putchar('('); ptraceopname(*ip); c = ','; ip++; narg--; break; case SYS_access: case SYS_eaccess: case SYS_faccessat: print_number(ip, narg, c); putchar(','); accessmodename(*ip); ip++; narg--; break; case SYS_open: case SYS_openat: print_number(ip, narg, c); putchar(','); flagsandmodename(ip[0], ip[1], decimal); ip += 2; narg -= 2; break; case SYS_wait4: print_number(ip, narg, c); print_number(ip, narg, c); /* * A flags value of zero is valid for * wait4() but not for wait6(), so * handle zero special here. */ if (*ip == 0) { print_number(ip, narg, c); } else { putchar(','); wait6optname(*ip); ip++; narg--; } break; case SYS_wait6: putchar('('); idtypename(*ip, decimal); c = ','; ip++; narg--; print_number64(first, ip, narg, c); print_number(ip, narg, c); putchar(','); wait6optname(*ip); ip++; narg--; break; case SYS_chmod: case SYS_fchmod: case SYS_lchmod: case SYS_fchmodat: print_number(ip, narg, c); putchar(','); modename(*ip); ip++; narg--; break; case SYS_mknod: case SYS_mknodat: print_number(ip, narg, c); putchar(','); modename(*ip); ip++; narg--; break; case SYS_getfsstat: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); getfsstatflagsname(*ip); ip++; narg--; break; case SYS_mount: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); mountflagsname(*ip); ip++; narg--; break; case SYS_unmount: print_number(ip, narg, c); putchar(','); mountflagsname(*ip); ip++; narg--; break; case SYS_recvmsg: case SYS_sendmsg: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); sendrecvflagsname(*ip); ip++; narg--; break; case SYS_recvfrom: case SYS_sendto: print_number(ip, narg, c); print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); sendrecvflagsname(*ip); ip++; narg--; break; case SYS_chflags: case SYS_fchflags: case SYS_lchflags: print_number(ip, narg, c); putchar(','); modename(*ip); ip++; narg--; break; case SYS_kill: print_number(ip, narg, c); putchar(','); signame(*ip); ip++; narg--; break; case SYS_reboot: putchar('('); rebootoptname(*ip); ip++; narg--; break; case SYS_umask: putchar('('); modename(*ip); ip++; narg--; break; case SYS_msync: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); msyncflagsname(*ip); ip++; narg--; break; #ifdef SYS_freebsd6_mmap case SYS_freebsd6_mmap: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); mmapprotname(*ip); putchar(','); ip++; narg--; mmapflagsname(*ip); ip++; narg--; break; #endif case SYS_mmap: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); mmapprotname(*ip); putchar(','); ip++; narg--; mmapflagsname(*ip); ip++; narg--; break; case SYS_mprotect: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); mmapprotname(*ip); ip++; narg--; break; case SYS_madvise: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); madvisebehavname(*ip); ip++; narg--; break; case SYS_setpriority: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); prioname(*ip); ip++; narg--; break; case SYS_fcntl: print_number(ip, narg, c); putchar(','); fcntlcmdname(ip[0], ip[1], decimal); ip += 2; narg -= 2; break; case SYS_socket: { int sockdomain; putchar('('); sockdomain = *ip; sockdomainname(sockdomain); ip++; narg--; putchar(','); socktypenamewithflags(*ip); ip++; narg--; if (sockdomain == PF_INET || sockdomain == PF_INET6) { putchar(','); sockipprotoname(*ip); ip++; narg--; } c = ','; break; } case SYS_setsockopt: case SYS_getsockopt: print_number(ip, narg, c); putchar(','); sockoptlevelname(*ip, decimal); if (*ip == SOL_SOCKET) { ip++; narg--; putchar(','); sockoptname(*ip); } ip++; narg--; break; #ifdef SYS_freebsd6_lseek case SYS_freebsd6_lseek: print_number(ip, narg, c); /* Hidden 'pad' argument, not in lseek(2) */ print_number(ip, narg, c); print_number64(first, ip, narg, c); putchar(','); whencename(*ip); ip++; narg--; break; #endif case SYS_lseek: print_number(ip, narg, c); print_number64(first, ip, narg, c); putchar(','); whencename(*ip); ip++; narg--; break; case SYS_flock: print_number(ip, narg, c); putchar(','); flockname(*ip); ip++; narg--; break; case SYS_mkfifo: case SYS_mkfifoat: case SYS_mkdir: case SYS_mkdirat: print_number(ip, narg, c); putchar(','); modename(*ip); ip++; narg--; break; case SYS_shutdown: print_number(ip, narg, c); putchar(','); shutdownhowname(*ip); ip++; narg--; break; case SYS_socketpair: putchar('('); sockdomainname(*ip); ip++; narg--; putchar(','); socktypenamewithflags(*ip); ip++; narg--; c = ','; break; case SYS_getrlimit: case SYS_setrlimit: putchar('('); rlimitname(*ip); ip++; narg--; c = ','; break; case SYS_quotactl: print_number(ip, narg, c); putchar(','); quotactlname(*ip); ip++; narg--; c = ','; break; case SYS_nfssvc: putchar('('); nfssvcname(*ip); ip++; narg--; c = ','; break; case SYS_rtprio: putchar('('); rtprioname(*ip); ip++; narg--; c = ','; break; case SYS___semctl: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); semctlname(*ip); ip++; narg--; break; case SYS_semget: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); semgetname(*ip); ip++; narg--; break; case SYS_msgctl: print_number(ip, narg, c); putchar(','); shmctlname(*ip); ip++; narg--; break; case SYS_shmat: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); shmatname(*ip); ip++; narg--; break; case SYS_shmctl: print_number(ip, narg, c); putchar(','); shmctlname(*ip); ip++; narg--; break; case SYS_shm_open: print_number(ip, narg, c); putchar(','); flagsname(ip[0]); printf(",0%o", (unsigned int)ip[1]); ip += 3; narg -= 3; break; case SYS_minherit: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); minheritname(*ip); ip++; narg--; break; case SYS_rfork: putchar('('); rforkname(*ip); ip++; narg--; c = ','; break; case SYS_lio_listio: putchar('('); lio_listioname(*ip); ip++; narg--; c = ','; break; case SYS_mlockall: putchar('('); mlockallname(*ip); ip++; narg--; break; case SYS_sched_setscheduler: print_number(ip, narg, c); putchar(','); schedpolicyname(*ip); ip++; narg--; break; case SYS_sched_get_priority_max: case SYS_sched_get_priority_min: putchar('('); schedpolicyname(*ip); ip++; narg--; break; case SYS_sendfile: print_number(ip, narg, c); print_number(ip, narg, c); print_number(ip, narg, c); print_number(ip, narg, c); print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); sendfileflagsname(*(int *)ip); ip++; narg--; break; case SYS_kldsym: print_number(ip, narg, c); putchar(','); kldsymcmdname(*ip); ip++; narg--; break; case SYS_sigprocmask: putchar('('); sigprocmaskhowname(*ip); ip++; narg--; c = ','; break; case SYS___acl_get_file: case SYS___acl_set_file: case SYS___acl_get_fd: case SYS___acl_set_fd: case SYS___acl_delete_file: case SYS___acl_delete_fd: case SYS___acl_aclcheck_file: case SYS___acl_aclcheck_fd: case SYS___acl_get_link: case SYS___acl_set_link: case SYS___acl_delete_link: case SYS___acl_aclcheck_link: print_number(ip, narg, c); putchar(','); acltypename(*ip); ip++; narg--; break; case SYS_sigaction: putchar('('); signame(*ip); ip++; narg--; c = ','; break; case SYS_extattrctl: print_number(ip, narg, c); putchar(','); extattrctlname(*ip); ip++; narg--; break; case SYS_nmount: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); mountflagsname(*ip); ip++; narg--; break; case SYS_thr_create: print_number(ip, narg, c); print_number(ip, narg, c); putchar(','); thrcreateflagsname(*ip); ip++; narg--; break; case SYS_thr_kill: print_number(ip, narg, c); putchar(','); signame(*ip); ip++; narg--; break; case SYS_kldunloadf: print_number(ip, narg, c); putchar(','); kldunloadfflagsname(*ip); ip++; narg--; break; case SYS_linkat: case SYS_renameat: case SYS_symlinkat: print_number(ip, narg, c); putchar(','); atfdname(*ip, decimal); ip++; narg--; break; case SYS_cap_fcntls_limit: print_number(ip, narg, c); putchar(','); arg = *ip; ip++; narg--; capfcntlname(arg); break; case SYS_posix_fadvise: print_number(ip, narg, c); print_number(ip, narg, c); print_number(ip, narg, c); (void)putchar(','); fadvisebehavname((int)*ip); ip++; narg--; break; case SYS_procctl: putchar('('); idtypename(*ip, decimal); c = ','; ip++; narg--; print_number64(first, ip, narg, c); putchar(','); procctlcmdname(*ip); ip++; narg--; break; case SYS__umtx_op: print_number(ip, narg, c); putchar(','); umtxopname(*ip); switch (*ip) { case UMTX_OP_CV_WAIT: ip++; narg--; putchar(','); umtxcvwaitflags(*ip); break; case UMTX_OP_RW_RDLOCK: ip++; narg--; putchar(','); umtxrwlockflags(*ip); break; } ip++; narg--; break; case SYS_ftruncate: case SYS_truncate: print_number(ip, narg, c); print_number64(first, ip, narg, c); break; } } while (narg > 0) { print_number(ip, narg, c); } putchar(')'); } putchar('\n'); } void ktrsysret(struct ktr_sysret *ktr, u_int sv_flags) { register_t ret = ktr->ktr_retval; int error = ktr->ktr_error; syscallname(ktr->ktr_code, sv_flags); printf(" "); if (error == 0) { if (fancy) { printf("%ld", (long)ret); if (ret < 0 || ret > 9) printf("/%#lx", (unsigned long)ret); } else { if (decimal) printf("%ld", (long)ret); else printf("%#lx", (unsigned long)ret); } } else if (error == ERESTART) printf("RESTART"); else if (error == EJUSTRETURN) printf("JUSTRETURN"); else { printf("-1 errno %d", sysdecode_freebsd_to_abi_errno( syscallabi(sv_flags), error)); if (fancy) printf(" %s", strerror(ktr->ktr_error)); } putchar('\n'); } void ktrnamei(char *cp, int len) { printf("\"%.*s\"\n", len, cp); } void hexdump(char *p, int len, int screenwidth) { int n, i; int width; width = 0; do { width += 2; i = 13; /* base offset */ i += (width / 2) + 1; /* spaces every second byte */ i += (width * 2); /* width of bytes */ i += 3; /* " |" */ i += width; /* each byte */ i += 1; /* "|" */ } while (i < screenwidth); width -= 2; for (n = 0; n < len; n += width) { for (i = n; i < n + width; i++) { if ((i % width) == 0) { /* beginning of line */ printf(" 0x%04x", i); } if ((i % 2) == 0) { printf(" "); } if (i < len) printf("%02x", p[i] & 0xff); else printf(" "); } printf(" |"); for (i = n; i < n + width; i++) { if (i >= len) break; if (p[i] >= ' ' && p[i] <= '~') printf("%c", p[i]); else printf("."); } printf("|\n"); } if ((i % width) != 0) printf("\n"); } void visdump(char *dp, int datalen, int screenwidth) { int col = 0; char *cp; int width; char visbuf[5]; printf(" \""); col = 8; for (;datalen > 0; datalen--, dp++) { vis(visbuf, *dp, VIS_CSTYLE, *(dp+1)); cp = visbuf; /* * Keep track of printables and * space chars (like fold(1)). */ if (col == 0) { putchar('\t'); col = 8; } switch(*cp) { case '\n': col = 0; putchar('\n'); continue; case '\t': width = 8 - (col&07); break; default: width = strlen(cp); } if (col + width > (screenwidth-2)) { printf("\\\n\t"); col = 8; } col += width; do { putchar(*cp++); } while (*cp); } if (col == 0) printf(" "); printf("\"\n"); } void ktrgenio(struct ktr_genio *ktr, int len) { int datalen = len - sizeof (struct ktr_genio); char *dp = (char *)ktr + sizeof (struct ktr_genio); static int screenwidth = 0; int i, binary; printf("fd %d %s %d byte%s\n", ktr->ktr_fd, ktr->ktr_rw == UIO_READ ? "read" : "wrote", datalen, datalen == 1 ? "" : "s"); if (suppressdata) return; if (screenwidth == 0) { struct winsize ws; if (fancy && ioctl(fileno(stderr), TIOCGWINSZ, &ws) != -1 && ws.ws_col > 8) screenwidth = ws.ws_col; else screenwidth = 80; } if (maxdata && datalen > maxdata) datalen = maxdata; for (i = 0, binary = 0; i < datalen && binary == 0; i++) { if (dp[i] >= 32 && dp[i] < 127) continue; if (dp[i] == 10 || dp[i] == 13 || dp[i] == 0 || dp[i] == 9) continue; binary = 1; } if (binary) hexdump(dp, datalen, screenwidth); else visdump(dp, datalen, screenwidth); } const char *signames[] = { "NULL", "HUP", "INT", "QUIT", "ILL", "TRAP", "IOT", /* 1 - 6 */ "EMT", "FPE", "KILL", "BUS", "SEGV", "SYS", /* 7 - 12 */ "PIPE", "ALRM", "TERM", "URG", "STOP", "TSTP", /* 13 - 18 */ "CONT", "CHLD", "TTIN", "TTOU", "IO", "XCPU", /* 19 - 24 */ "XFSZ", "VTALRM", "PROF", "WINCH", "29", "USR1", /* 25 - 30 */ "USR2", NULL, /* 31 - 32 */ }; void ktrpsig(struct ktr_psig *psig) { if (psig->signo > 0 && psig->signo < NSIG) printf("SIG%s ", signames[psig->signo]); else printf("SIG %d ", psig->signo); if (psig->action == SIG_DFL) { printf("SIG_DFL code="); sigcodename(psig->signo, psig->code); putchar('\n'); } else { printf("caught handler=0x%lx mask=0x%x code=", (u_long)psig->action, psig->mask.__bits[0]); sigcodename(psig->signo, psig->code); putchar('\n'); } } void ktrcsw_old(struct ktr_csw_old *cs) { printf("%s %s\n", cs->out ? "stop" : "resume", cs->user ? "user" : "kernel"); } void ktrcsw(struct ktr_csw *cs) { printf("%s %s \"%s\"\n", cs->out ? "stop" : "resume", cs->user ? "user" : "kernel", cs->wmesg); } void ktruser(int len, void *p) { unsigned char *cp; if (sysdecode_utrace(stdout, p, len)) { printf("\n"); return; } printf("%d ", len); cp = p; while (len--) if (decimal) printf(" %d", *cp++); else printf(" %02x", *cp++); printf("\n"); } void ktrcaprights(cap_rights_t *rightsp) { printf("cap_rights_t "); capname(rightsp); printf("\n"); } static void ktrtimeval(struct timeval *tv) { printf("{%ld, %ld}", (long)tv->tv_sec, tv->tv_usec); } void ktritimerval(struct itimerval *it) { printf("itimerval { .interval = "); ktrtimeval(&it->it_interval); printf(", .value = "); ktrtimeval(&it->it_value); printf(" }\n"); } void ktrsockaddr(struct sockaddr *sa) { /* TODO: Support additional address families #include struct sockaddr_natm *natm; #include struct sockaddr_nb *nb; */ char addr[64]; /* * note: ktrstruct() has already verified that sa points to a * buffer at least sizeof(struct sockaddr) bytes long and exactly * sa->sa_len bytes long. */ printf("struct sockaddr { "); sockfamilyname(sa->sa_family); printf(", "); #define check_sockaddr_len(n) \ if (sa_##n.s##n##_len < sizeof(struct sockaddr_##n)) { \ printf("invalid"); \ break; \ } switch(sa->sa_family) { case AF_INET: { struct sockaddr_in sa_in; memset(&sa_in, 0, sizeof(sa_in)); memcpy(&sa_in, sa, sa->sa_len); check_sockaddr_len(in); inet_ntop(AF_INET, &sa_in.sin_addr, addr, sizeof addr); printf("%s:%u", addr, ntohs(sa_in.sin_port)); break; } case AF_INET6: { struct sockaddr_in6 sa_in6; memset(&sa_in6, 0, sizeof(sa_in6)); memcpy(&sa_in6, sa, sa->sa_len); check_sockaddr_len(in6); getnameinfo((struct sockaddr *)&sa_in6, sizeof(sa_in6), addr, sizeof(addr), NULL, 0, NI_NUMERICHOST); printf("[%s]:%u", addr, htons(sa_in6.sin6_port)); break; } case AF_UNIX: { struct sockaddr_un sa_un; memset(&sa_un, 0, sizeof(sa_un)); memcpy(&sa_un, sa, sa->sa_len); printf("%.*s", (int)sizeof(sa_un.sun_path), sa_un.sun_path); break; } default: printf("unknown address family"); } printf(" }\n"); } void ktrstat(struct stat *statp) { char mode[12], timestr[PATH_MAX + 4]; struct passwd *pwd; struct group *grp; struct tm *tm; /* * note: ktrstruct() has already verified that statp points to a * buffer exactly sizeof(struct stat) bytes long. */ printf("struct stat {"); printf("dev=%ju, ino=%ju, ", (uintmax_t)statp->st_dev, (uintmax_t)statp->st_ino); if (resolv == 0) printf("mode=0%jo, ", (uintmax_t)statp->st_mode); else { strmode(statp->st_mode, mode); printf("mode=%s, ", mode); } printf("nlink=%ju, ", (uintmax_t)statp->st_nlink); if (resolv == 0) { pwd = NULL; } else { #ifdef HAVE_LIBCASPER if (cappwd != NULL) pwd = cap_getpwuid(cappwd, statp->st_uid); else #endif pwd = getpwuid(statp->st_uid); } if (pwd == NULL) printf("uid=%ju, ", (uintmax_t)statp->st_uid); else printf("uid=\"%s\", ", pwd->pw_name); if (resolv == 0) { grp = NULL; } else { #ifdef HAVE_LIBCASPER if (capgrp != NULL) grp = cap_getgrgid(capgrp, statp->st_gid); else #endif grp = getgrgid(statp->st_gid); } if (grp == NULL) printf("gid=%ju, ", (uintmax_t)statp->st_gid); else printf("gid=\"%s\", ", grp->gr_name); printf("rdev=%ju, ", (uintmax_t)statp->st_rdev); printf("atime="); if (resolv == 0) printf("%jd", (intmax_t)statp->st_atim.tv_sec); else { tm = localtime(&statp->st_atim.tv_sec); strftime(timestr, sizeof(timestr), TIME_FORMAT, tm); printf("\"%s\"", timestr); } if (statp->st_atim.tv_nsec != 0) printf(".%09ld, ", statp->st_atim.tv_nsec); else printf(", "); printf("mtime="); if (resolv == 0) printf("%jd", (intmax_t)statp->st_mtim.tv_sec); else { tm = localtime(&statp->st_mtim.tv_sec); strftime(timestr, sizeof(timestr), TIME_FORMAT, tm); printf("\"%s\"", timestr); } if (statp->st_mtim.tv_nsec != 0) printf(".%09ld, ", statp->st_mtim.tv_nsec); else printf(", "); printf("ctime="); if (resolv == 0) printf("%jd", (intmax_t)statp->st_ctim.tv_sec); else { tm = localtime(&statp->st_ctim.tv_sec); strftime(timestr, sizeof(timestr), TIME_FORMAT, tm); printf("\"%s\"", timestr); } if (statp->st_ctim.tv_nsec != 0) printf(".%09ld, ", statp->st_ctim.tv_nsec); else printf(", "); printf("birthtime="); if (resolv == 0) printf("%jd", (intmax_t)statp->st_birthtim.tv_sec); else { tm = localtime(&statp->st_birthtim.tv_sec); strftime(timestr, sizeof(timestr), TIME_FORMAT, tm); printf("\"%s\"", timestr); } if (statp->st_birthtim.tv_nsec != 0) printf(".%09ld, ", statp->st_birthtim.tv_nsec); else printf(", "); printf("size=%jd, blksize=%ju, blocks=%jd, flags=0x%x", (uintmax_t)statp->st_size, (uintmax_t)statp->st_blksize, (intmax_t)statp->st_blocks, statp->st_flags); printf(" }\n"); } void ktrstruct(char *buf, size_t buflen) { char *name, *data; size_t namelen, datalen; int i; cap_rights_t rights; struct itimerval it; struct stat sb; struct sockaddr_storage ss; for (name = buf, namelen = 0; namelen < buflen && name[namelen] != '\0'; ++namelen) /* nothing */; if (namelen == buflen) goto invalid; if (name[namelen] != '\0') goto invalid; data = buf + namelen + 1; datalen = buflen - namelen - 1; if (datalen == 0) goto invalid; /* sanity check */ for (i = 0; i < (int)namelen; ++i) if (!isalpha(name[i])) goto invalid; if (strcmp(name, "caprights") == 0) { if (datalen != sizeof(cap_rights_t)) goto invalid; memcpy(&rights, data, datalen); ktrcaprights(&rights); } else if (strcmp(name, "itimerval") == 0) { if (datalen != sizeof(struct itimerval)) goto invalid; memcpy(&it, data, datalen); ktritimerval(&it); } else if (strcmp(name, "stat") == 0) { if (datalen != sizeof(struct stat)) goto invalid; memcpy(&sb, data, datalen); ktrstat(&sb); } else if (strcmp(name, "sockaddr") == 0) { if (datalen > sizeof(ss)) goto invalid; memcpy(&ss, data, datalen); if (datalen != ss.ss_len) goto invalid; ktrsockaddr((struct sockaddr *)&ss); } else { printf("unknown structure\n"); } return; invalid: printf("invalid record\n"); } void ktrcapfail(struct ktr_cap_fail *ktr) { switch (ktr->cap_type) { case CAPFAIL_NOTCAPABLE: /* operation on fd with insufficient capabilities */ printf("operation requires "); capname(&ktr->cap_needed); printf(", descriptor holds "); capname(&ktr->cap_held); break; case CAPFAIL_INCREASE: /* requested more capabilities than fd already has */ printf("attempt to increase capabilities from "); capname(&ktr->cap_held); printf(" to "); capname(&ktr->cap_needed); break; case CAPFAIL_SYSCALL: /* called restricted syscall */ printf("disallowed system call"); break; case CAPFAIL_LOOKUP: /* used ".." in strict-relative mode */ printf("restricted VFS lookup"); break; default: printf("unknown capability failure: "); capname(&ktr->cap_needed); printf(" "); capname(&ktr->cap_held); break; } printf("\n"); } void ktrfault(struct ktr_fault *ktr) { printf("0x%jx ", (uintmax_t)ktr->vaddr); vmprotname(ktr->type); printf("\n"); } void ktrfaultend(struct ktr_faultend *ktr) { vmresultname(ktr->result); printf("\n"); } void usage(void) { fprintf(stderr, "usage: kdump [-dEnlHRrSsTA] [-f trfile] " "[-m maxdata] [-p pid] [-t trstr]\n"); exit(1); } Index: user/alc/PQ_LAUNDRY/usr.bin/localedef/ctype.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/localedef/ctype.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/localedef/ctype.c (revision 306832) @@ -1,463 +1,463 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2012 Garrett D'Amore All rights reserved. * Copyright 2015 John Marino * * This source code is derived from the illumos localedef command, and * provided under BSD-style license terms by Nexenta Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * LC_CTYPE database generation routines for localedef. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "localedef.h" #include "parser.h" #include "runefile.h" /* Needed for bootstrapping, _CTYPE_N */ #ifndef _CTYPE_N #define _CTYPE_N 0x00400000L #endif #define _ISUPPER _CTYPE_U #define _ISLOWER _CTYPE_L #define _ISDIGIT _CTYPE_D #define _ISXDIGIT _CTYPE_X #define _ISSPACE _CTYPE_S #define _ISBLANK _CTYPE_B #define _ISALPHA _CTYPE_A #define _ISPUNCT _CTYPE_P #define _ISGRAPH _CTYPE_G #define _ISPRINT _CTYPE_R #define _ISCNTRL _CTYPE_C #define _E1 _CTYPE_Q #define _E2 _CTYPE_I #define _E3 0 #define _E4 _CTYPE_N #define _E5 _CTYPE_T static wchar_t last_ctype; static int ctype_compare(const void *n1, const void *n2); typedef struct ctype_node { wchar_t wc; int32_t ctype; int32_t toupper; int32_t tolower; RB_ENTRY(ctype_node) entry; } ctype_node_t; static RB_HEAD(ctypes, ctype_node) ctypes; RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare); static int ctype_compare(const void *n1, const void *n2) { const ctype_node_t *c1 = n1; const ctype_node_t *c2 = n2; return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); } void init_ctype(void) { RB_INIT(&ctypes); } static void add_ctype_impl(ctype_node_t *ctn) { switch (last_kw) { case T_ISUPPER: ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); break; case T_ISLOWER: ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); break; case T_ISALPHA: ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); break; case T_ISDIGIT: ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); break; case T_ISSPACE: ctn->ctype |= _ISSPACE; break; case T_ISCNTRL: ctn->ctype |= _ISCNTRL; break; case T_ISGRAPH: ctn->ctype |= (_ISGRAPH | _ISPRINT); break; case T_ISPRINT: ctn->ctype |= _ISPRINT; break; case T_ISPUNCT: ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); break; case T_ISXDIGIT: ctn->ctype |= (_ISXDIGIT | _ISPRINT); break; case T_ISBLANK: ctn->ctype |= (_ISBLANK | _ISSPACE); break; case T_ISPHONOGRAM: ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); break; case T_ISIDEOGRAM: ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); break; case T_ISENGLISH: ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); break; case T_ISNUMBER: ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); break; case T_ISSPECIAL: ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); break; case T_ISALNUM: /* * We can't do anything with this. The character * should already be specified as a digit or alpha. */ break; default: errf("not a valid character class"); } } static ctype_node_t * get_ctype(wchar_t wc) { ctype_node_t srch; ctype_node_t *ctn; srch.wc = wc; if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { errf("out of memory"); return (NULL); } ctn->wc = wc; RB_INSERT(ctypes, &ctypes, ctn); } return (ctn); } void add_ctype(int val) { ctype_node_t *ctn; if ((ctn = get_ctype(val)) == NULL) { INTERR; return; } add_ctype_impl(ctn); last_ctype = ctn->wc; } void add_ctype_range(wchar_t end) { ctype_node_t *ctn; wchar_t cur; if (end < last_ctype) { errf("malformed character range (%u ... %u))", last_ctype, end); return; } for (cur = last_ctype + 1; cur <= end; cur++) { if ((ctn = get_ctype(cur)) == NULL) { INTERR; return; } add_ctype_impl(ctn); } last_ctype = end; } /* * A word about widths: if the width mask is specified, then libc * unconditionally honors it. Otherwise, it assumes printable * characters have width 1, and non-printable characters have width * -1 (except for NULL which is special with with 0). Hence, we have * no need to inject defaults here -- the "default" unset value of 0 * indicates that libc should use its own logic in wcwidth as described. */ void add_width(int wc, int width) { ctype_node_t *ctn; if ((ctn = get_ctype(wc)) == NULL) { INTERR; return; } ctn->ctype &= ~(_CTYPE_SWM); switch (width) { case 0: ctn->ctype |= _CTYPE_SW0; break; case 1: ctn->ctype |= _CTYPE_SW1; break; case 2: ctn->ctype |= _CTYPE_SW2; break; case 3: ctn->ctype |= _CTYPE_SW3; break; } } void add_width_range(int start, int end, int width) { for (; start <= end; start++) { add_width(start, width); } } void add_caseconv(int val, int wc) { ctype_node_t *ctn; ctn = get_ctype(val); if (ctn == NULL) { INTERR; return; } switch (last_kw) { case T_TOUPPER: ctn->toupper = wc; break; case T_TOLOWER: ctn->tolower = wc; break; default: INTERR; break; } } void dump_ctype(void) { FILE *f; _FileRuneLocale rl; ctype_node_t *ctn, *last_ct, *last_lo, *last_up; _FileRuneEntry *ct = NULL; _FileRuneEntry *lo = NULL; _FileRuneEntry *up = NULL; wchar_t wc; (void) memset(&rl, 0, sizeof (rl)); last_ct = NULL; last_lo = NULL; last_up = NULL; if ((f = open_category()) == NULL) return; (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); /* * Initialize the identity map. */ for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { rl.maplower[wc] = wc; rl.mapupper[wc] = wc; } RB_FOREACH(ctn, ctypes, &ctypes) { int conflict = 0; wc = ctn->wc; /* * POSIX requires certain portable characters have * certain types. Add them if they are missing. */ if ((wc >= 1) && (wc <= 127)) { if ((wc >= 'A') && (wc <= 'Z')) ctn->ctype |= _ISUPPER; if ((wc >= 'a') && (wc <= 'z')) ctn->ctype |= _ISLOWER; if ((wc >= '0') && (wc <= '9')) ctn->ctype |= _ISDIGIT; if (wc == ' ') ctn->ctype |= _ISPRINT; if (strchr(" \f\n\r\t\v", (char)wc) != NULL) ctn->ctype |= _ISSPACE; if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) ctn->ctype |= _ISXDIGIT; if (strchr(" \t", (char)wc)) ctn->ctype |= _ISBLANK; /* * Technically these settings are only * required for the C locale. However, it * turns out that because of the historical * version of isprint(), we need them for all * locales as well. Note that these are not * necessarily valid punctation characters in * the current language, but ispunct() needs * to return TRUE for them. */ if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", (char)wc)) ctn->ctype |= _ISPUNCT; } /* * POSIX also requires that certain types imply * others. Add any inferred types here. */ if (ctn->ctype & (_ISUPPER |_ISLOWER)) ctn->ctype |= _ISALPHA; if (ctn->ctype & _ISDIGIT) ctn->ctype |= _ISXDIGIT; if (ctn->ctype & _ISBLANK) ctn->ctype |= _ISSPACE; if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) ctn->ctype |= _ISGRAPH; if (ctn->ctype & _ISGRAPH) ctn->ctype |= _ISPRINT; /* * Finally, POSIX requires that certain combinations * are invalid. We don't flag this as a fatal error, * but we will warn about. */ if ((ctn->ctype & _ISALPHA) && (ctn->ctype & (_ISPUNCT|_ISDIGIT))) conflict++; if ((ctn->ctype & _ISPUNCT) & (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) conflict++; if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) conflict++; if ((ctn->ctype & _ISCNTRL) & _ISPRINT) conflict++; if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) conflict++; if (conflict) { warn("conflicting classes for character 0x%x (%x)", wc, ctn->ctype); } /* * Handle the lower 256 characters using the simple * optimization. Note that if we have not defined the * upper/lower case, then we identity map it. */ if ((unsigned)wc < _CACHED_RUNES) { rl.runetype[wc] = ctn->ctype; if (ctn->tolower) rl.maplower[wc] = ctn->tolower; if (ctn->toupper) rl.mapupper[wc] = ctn->toupper; continue; } - if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { + if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) && + (last_ct->wc + 1 == wc)) { ct[rl.runetype_ext_nranges-1].max = wc; - last_ct = ctn; } else { rl.runetype_ext_nranges++; ct = realloc(ct, sizeof (*ct) * rl.runetype_ext_nranges); ct[rl.runetype_ext_nranges - 1].min = wc; ct[rl.runetype_ext_nranges - 1].max = wc; ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; - last_ct = ctn; } + last_ct = ctn; if (ctn->tolower == 0) { last_lo = NULL; } else if ((last_lo != NULL) && (last_lo->tolower + 1 == ctn->tolower)) { lo[rl.maplower_ext_nranges-1].max = wc; last_lo = ctn; } else { rl.maplower_ext_nranges++; lo = realloc(lo, sizeof (*lo) * rl.maplower_ext_nranges); lo[rl.maplower_ext_nranges - 1].min = wc; lo[rl.maplower_ext_nranges - 1].max = wc; lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; last_lo = ctn; } if (ctn->toupper == 0) { last_up = NULL; } else if ((last_up != NULL) && (last_up->toupper + 1 == ctn->toupper)) { up[rl.mapupper_ext_nranges-1].max = wc; last_up = ctn; } else { rl.mapupper_ext_nranges++; up = realloc(up, sizeof (*up) * rl.mapupper_ext_nranges); up[rl.mapupper_ext_nranges - 1].min = wc; up[rl.mapupper_ext_nranges - 1].max = wc; up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; last_up = ctn; } } if ((wr_category(&rl, sizeof (rl), f) < 0) || (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { return; } close_category(f); } Index: user/alc/PQ_LAUNDRY/usr.bin/localedef/parser.y =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/localedef/parser.y (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/localedef/parser.y (revision 306832) @@ -1,706 +1,705 @@ %{ /* * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright 2015 John Marino * * This source code is derived from the illumos localedef command, and * provided under BSD-style license terms by Nexenta Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ */ /* * POSIX localedef grammar. */ #include #include #include #include "localedef.h" %} %union { int num; wchar_t wc; char *token; collsym_t *collsym; collelem_t *collelem; } %token T_CODE_SET %token T_MB_CUR_MAX %token T_MB_CUR_MIN %token T_COM_CHAR %token T_ESC_CHAR %token T_LT %token T_GT %token T_NL %token T_SEMI %token T_COMMA %token T_ELLIPSIS %token T_RPAREN %token T_LPAREN %token T_QUOTE %token T_NULL %token T_WS %token T_END %token T_COPY %token T_CHARMAP %token T_WIDTH %token T_CTYPE %token T_ISUPPER %token T_ISLOWER %token T_ISALPHA %token T_ISDIGIT %token T_ISPUNCT %token T_ISXDIGIT %token T_ISSPACE %token T_ISPRINT %token T_ISGRAPH %token T_ISBLANK %token T_ISCNTRL %token T_ISALNUM %token T_ISSPECIAL %token T_ISPHONOGRAM %token T_ISIDEOGRAM %token T_ISENGLISH %token T_ISNUMBER %token T_TOUPPER %token T_TOLOWER %token T_COLLATE %token T_COLLATING_SYMBOL %token T_COLLATING_ELEMENT %token T_ORDER_START %token T_ORDER_END %token T_FORWARD %token T_BACKWARD %token T_POSITION %token T_FROM %token T_UNDEFINED %token T_IGNORE %token T_MESSAGES %token T_YESSTR %token T_NOSTR %token T_YESEXPR %token T_NOEXPR %token T_MONETARY %token T_INT_CURR_SYMBOL %token T_CURRENCY_SYMBOL %token T_MON_DECIMAL_POINT %token T_MON_THOUSANDS_SEP %token T_POSITIVE_SIGN %token T_NEGATIVE_SIGN %token T_MON_GROUPING %token T_INT_FRAC_DIGITS %token T_FRAC_DIGITS %token T_P_CS_PRECEDES %token T_P_SEP_BY_SPACE %token T_N_CS_PRECEDES %token T_N_SEP_BY_SPACE %token T_P_SIGN_POSN %token T_N_SIGN_POSN %token T_INT_P_CS_PRECEDES %token T_INT_N_CS_PRECEDES %token T_INT_P_SEP_BY_SPACE %token T_INT_N_SEP_BY_SPACE %token T_INT_P_SIGN_POSN %token T_INT_N_SIGN_POSN %token T_NUMERIC %token T_DECIMAL_POINT %token T_THOUSANDS_SEP %token T_GROUPING %token T_TIME %token T_ABDAY %token T_DAY %token T_ABMON %token T_MON %token T_ERA %token T_ERA_D_FMT %token T_ERA_T_FMT %token T_ERA_D_T_FMT %token T_ALT_DIGITS %token T_D_T_FMT %token T_D_FMT %token T_T_FMT %token T_AM_PM %token T_T_FMT_AMPM %token T_DATE_FMT %token T_CHAR %token T_NAME %token T_NUMBER %token T_SYMBOL %token T_COLLSYM %token T_COLLELEM %% localedef : setting_list categories | categories ; string : T_QUOTE charlist T_QUOTE | T_QUOTE T_QUOTE ; charlist : charlist T_CHAR { add_wcs($2); } | T_CHAR { add_wcs($1); } ; setting_list : setting_list setting | setting ; setting : T_COM_CHAR T_CHAR T_NL { com_char = $2; } | T_ESC_CHAR T_CHAR T_NL { esc_char = $2; } | T_MB_CUR_MAX T_NUMBER T_NL { mb_cur_max = $2; } | T_MB_CUR_MIN T_NUMBER T_NL { mb_cur_min = $2; } | T_CODE_SET string T_NL { wchar_t *w = get_wcs(); set_wide_encoding(to_mb_string(w)); free(w); } | T_CODE_SET T_NAME T_NL { set_wide_encoding($2); } ; copycat : T_COPY T_NAME T_NL { copy_category($2); } | T_COPY string T_NL { wchar_t *w = get_wcs(); copy_category(to_mb_string(w)); free(w); } ; categories : categories category | category ; category : charmap | messages | monetary | ctype | collate | numeric | time ; charmap : T_CHARMAP T_NL charmap_list T_END T_CHARMAP T_NL | T_WIDTH T_NL width_list T_END T_WIDTH T_NL ; charmap_list : charmap_list charmap_entry | charmap_entry ; charmap_entry : T_SYMBOL T_CHAR { add_charmap($1, $2); scan_to_eol(); } | T_SYMBOL T_ELLIPSIS T_SYMBOL T_CHAR { add_charmap_range($1, $3, $4); scan_to_eol(); } | T_NL ; width_list : width_list width_entry | width_entry ; width_entry : T_CHAR T_NUMBER T_NL { add_width($1, $2); } | T_SYMBOL T_NUMBER T_NL { add_charmap_undefined($1); } | T_CHAR T_ELLIPSIS T_CHAR T_NUMBER T_NL { add_width_range($1, $3, $4); } | T_SYMBOL T_ELLIPSIS T_SYMBOL T_NUMBER T_NL { add_charmap_undefined($1); add_charmap_undefined($3); } | T_CHAR T_ELLIPSIS T_SYMBOL T_NUMBER T_NL { add_width($1, $4); add_charmap_undefined($3); } | T_SYMBOL T_ELLIPSIS T_CHAR T_NUMBER T_NL { add_width($3, $4); add_charmap_undefined($1); } | T_NL ; ctype : T_CTYPE T_NL ctype_list T_END T_CTYPE T_NL { dump_ctype(); } | T_CTYPE T_NL copycat T_END T_CTYPE T_NL ; ctype_list : ctype_list ctype_kw | ctype_kw ; ctype_kw : T_ISUPPER cc_list T_NL | T_ISLOWER cc_list T_NL | T_ISALPHA cc_list T_NL | T_ISDIGIT cc_list T_NL | T_ISPUNCT cc_list T_NL | T_ISXDIGIT cc_list T_NL | T_ISSPACE cc_list T_NL | T_ISPRINT cc_list T_NL | T_ISGRAPH cc_list T_NL | T_ISBLANK cc_list T_NL | T_ISCNTRL cc_list T_NL | T_ISALNUM cc_list T_NL | T_ISSPECIAL cc_list T_NL | T_ISENGLISH cc_list T_NL | T_ISNUMBER cc_list T_NL | T_ISIDEOGRAM cc_list T_NL | T_ISPHONOGRAM cc_list T_NL | T_TOUPPER conv_list T_NL | T_TOLOWER conv_list T_NL ; +cc_list : cc_list T_SEMI cc_range_end + | cc_list T_SEMI cc_char + | cc_char + ; -cc_list : cc_list T_SEMI T_CHAR +cc_range_end : T_ELLIPSIS T_SEMI T_CHAR { - add_ctype($3); + add_ctype_range($3); } - | cc_list T_SEMI T_SYMBOL - { - add_charmap_undefined($3); - } - | cc_list T_SEMI T_ELLIPSIS T_SEMI T_CHAR - { - /* note that the endpoints *must* be characters */ - add_ctype_range($5); - } - | T_CHAR + ; + +cc_char : T_CHAR { add_ctype($1); } | T_SYMBOL { add_charmap_undefined($1); } ; conv_list : conv_list T_SEMI conv_pair | conv_pair ; conv_pair : T_LPAREN T_CHAR T_COMMA T_CHAR T_RPAREN { add_caseconv($2, $4); } | T_LPAREN T_SYMBOL T_COMMA T_CHAR T_RPAREN { add_charmap_undefined($2); } | T_LPAREN T_SYMBOL T_COMMA T_SYMBOL T_RPAREN { add_charmap_undefined($2); add_charmap_undefined($4); } | T_LPAREN T_CHAR T_COMMA T_SYMBOL T_RPAREN { add_charmap_undefined($4); } ; collate : T_COLLATE T_NL coll_order T_END T_COLLATE T_NL { dump_collate(); } | T_COLLATE T_NL coll_optional coll_order T_END T_COLLATE T_NL { dump_collate(); } | T_COLLATE T_NL copycat T_END T_COLLATE T_NL ; coll_optional : coll_optional coll_symbols | coll_optional coll_elements | coll_symbols | coll_elements ; coll_symbols : T_COLLATING_SYMBOL T_SYMBOL T_NL { define_collsym($2); } ; coll_elements : T_COLLATING_ELEMENT T_SYMBOL T_FROM string T_NL { define_collelem($2, get_wcs()); } ; coll_order : T_ORDER_START T_NL order_list T_ORDER_END T_NL { /* If no order list supplied default to one forward */ add_order_bit(T_FORWARD); add_order_directive(); } | T_ORDER_START order_args T_NL order_list T_ORDER_END T_NL ; order_args : order_args T_SEMI order_arg { add_order_directive(); } | order_arg { add_order_directive(); } ; order_arg : order_arg T_COMMA order_dir | order_dir ; order_dir : T_FORWARD { add_order_bit(T_FORWARD); } | T_BACKWARD { add_order_bit(T_BACKWARD); } | T_POSITION { add_order_bit(T_POSITION); } ; order_list : order_list order_item | order_item ; order_item : T_COLLSYM T_NL { end_order_collsym($1); } | order_itemkw T_NL { end_order(); } | order_itemkw order_weights T_NL { end_order(); } ; order_itemkw : T_CHAR { start_order_char($1); } | T_ELLIPSIS { start_order_ellipsis(); } | T_COLLELEM { start_order_collelem($1); } | T_UNDEFINED { start_order_undefined(); } | T_SYMBOL { start_order_symbol($1); } ; order_weights : order_weights T_SEMI order_weight | order_weights T_SEMI | order_weight ; order_weight : T_COLLELEM { add_order_collelem($1); } | T_COLLSYM { add_order_collsym($1); } | T_CHAR { add_order_char($1); } | T_ELLIPSIS { add_order_ellipsis(); } | T_IGNORE { add_order_ignore(); } | T_SYMBOL { add_order_symbol($1); } | T_QUOTE order_str T_QUOTE { add_order_subst(); } ; order_str : order_str order_stritem | order_stritem ; order_stritem : T_CHAR { add_subst_char($1); } | T_COLLSYM { add_subst_collsym($1); } | T_COLLELEM { add_subst_collelem($1); } | T_SYMBOL { add_subst_symbol($1); } ; messages : T_MESSAGES T_NL messages_list T_END T_MESSAGES T_NL { dump_messages(); } | T_MESSAGES T_NL copycat T_END T_MESSAGES T_NL ; messages_list : messages_list messages_item | messages_item ; messages_kw : T_YESSTR | T_NOSTR | T_YESEXPR | T_NOEXPR ; messages_item : messages_kw string T_NL { add_message(get_wcs()); } ; monetary : T_MONETARY T_NL monetary_list T_END T_MONETARY T_NL { dump_monetary(); } | T_MONETARY T_NL copycat T_END T_MONETARY T_NL ; monetary_list : monetary_list monetary_kw | monetary_kw ; monetary_strkw : T_INT_CURR_SYMBOL | T_CURRENCY_SYMBOL | T_MON_DECIMAL_POINT | T_MON_THOUSANDS_SEP | T_POSITIVE_SIGN | T_NEGATIVE_SIGN ; monetary_numkw : T_INT_FRAC_DIGITS | T_FRAC_DIGITS | T_P_CS_PRECEDES | T_P_SEP_BY_SPACE | T_N_CS_PRECEDES | T_N_SEP_BY_SPACE | T_P_SIGN_POSN | T_N_SIGN_POSN | T_INT_P_CS_PRECEDES | T_INT_N_CS_PRECEDES | T_INT_P_SEP_BY_SPACE | T_INT_N_SEP_BY_SPACE | T_INT_P_SIGN_POSN | T_INT_N_SIGN_POSN ; monetary_kw : monetary_strkw string T_NL { add_monetary_str(get_wcs()); } | monetary_numkw T_NUMBER T_NL { add_monetary_num($2); } | T_MON_GROUPING mon_group_list T_NL ; mon_group_list : T_NUMBER { reset_monetary_group(); add_monetary_group($1); } | mon_group_list T_SEMI T_NUMBER { add_monetary_group($3); } ; numeric : T_NUMERIC T_NL numeric_list T_END T_NUMERIC T_NL { dump_numeric(); } | T_NUMERIC T_NL copycat T_END T_NUMERIC T_NL ; numeric_list : numeric_list numeric_item | numeric_item ; numeric_item : numeric_strkw string T_NL { add_numeric_str(get_wcs()); } | T_GROUPING group_list T_NL ; numeric_strkw : T_DECIMAL_POINT | T_THOUSANDS_SEP ; group_list : T_NUMBER { reset_numeric_group(); add_numeric_group($1); } | group_list T_SEMI T_NUMBER { add_numeric_group($3); } ; time : T_TIME T_NL time_kwlist T_END T_TIME T_NL { dump_time(); } | T_TIME T_NL copycat T_END T_NUMERIC T_NL ; time_kwlist : time_kwlist time_kw | time_kw ; time_kw : time_strkw string T_NL { add_time_str(get_wcs()); } | time_listkw time_list T_NL { check_time_list(); } ; time_listkw : T_ABDAY | T_DAY | T_ABMON | T_MON | T_ERA | T_ALT_DIGITS | T_AM_PM ; time_strkw : T_ERA_D_T_FMT | T_ERA_T_FMT | T_ERA_D_FMT | T_D_T_FMT | T_D_FMT | T_T_FMT | T_T_FMT_AMPM | T_DATE_FMT ; time_list : time_list T_SEMI string { add_time_list(get_wcs()); } | string { reset_time_list(); add_time_list(get_wcs()); } ; Property changes on: user/alc/PQ_LAUNDRY/usr.bin/localedef/parser.y ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: user/alc/PQ_LAUNDRY/usr.bin/tee/tee.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/tee/tee.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/tee/tee.c (revision 306832) @@ -1,172 +1,158 @@ /* * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1988, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)tee.c 8.1 (Berkeley) 6/6/93"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include +#include #include #include #include #include #include #include #include -#include #include typedef struct _list { struct _list *next; int fd; const char *name; } LIST; static LIST *head; static void add(int, const char *); static void usage(void); int main(int argc, char *argv[]) { LIST *p; int n, fd, rval, wval; char *bp; int append, ch, exitval; char *buf; - cap_rights_t rights; - unsigned long cmd; #define BSIZE (8 * 1024) append = 0; while ((ch = getopt(argc, argv, "ai")) != -1) switch((char)ch) { case 'a': append = 1; break; case 'i': (void)signal(SIGINT, SIG_IGN); break; case '?': default: usage(); } argv += optind; argc -= optind; if ((buf = malloc(BSIZE)) == NULL) err(1, "malloc"); - cap_rights_init(&rights, CAP_READ, CAP_FSTAT); - if (cap_rights_limit(STDIN_FILENO, &rights) < 0 && errno != ENOSYS) - err(EXIT_FAILURE, "unable to limit rights for stdin"); - cap_rights_init(&rights, CAP_WRITE, CAP_FSTAT, CAP_IOCTL); - if (cap_rights_limit(STDERR_FILENO, &rights) < 0 && errno != ENOSYS) - err(EXIT_FAILURE, "unable to limit rights for stderr"); - cmd = TIOCGETA; - if (cap_ioctls_limit(STDERR_FILENO, &cmd, 1) < 0 && errno != ENOSYS) - err(EXIT_FAILURE, "unable to limit ioctls for stderr"); + if (caph_limit_stdin() == -1 || caph_limit_stderr() == -1) + err(EXIT_FAILURE, "unable to limit stdio"); add(STDOUT_FILENO, "stdout"); for (exitval = 0; *argv; ++argv) if ((fd = open(*argv, append ? O_WRONLY|O_CREAT|O_APPEND : O_WRONLY|O_CREAT|O_TRUNC, DEFFILEMODE)) < 0) { warn("%s", *argv); exitval = 1; } else add(fd, *argv); if (cap_enter() < 0 && errno != ENOSYS) err(EXIT_FAILURE, "unable to enter capability mode"); while ((rval = read(STDIN_FILENO, buf, BSIZE)) > 0) for (p = head; p; p = p->next) { n = rval; bp = buf; do { if ((wval = write(p->fd, bp, n)) == -1) { warn("%s", p->name); exitval = 1; break; } bp += wval; } while (n -= wval); } if (rval < 0) err(1, "read"); exit(exitval); } static void usage(void) { (void)fprintf(stderr, "usage: tee [-ai] [file ...]\n"); exit(1); } static void add(int fd, const char *name) { LIST *p; cap_rights_t rights; - unsigned long cmd; - if (fd == STDOUT_FILENO) - cap_rights_init(&rights, CAP_WRITE, CAP_FSTAT, CAP_IOCTL); - else - cap_rights_init(&rights, CAP_WRITE, CAP_FSTAT); - if (cap_rights_limit(fd, &rights) < 0 && errno != ENOSYS) - err(EXIT_FAILURE, "unable to limit rights"); - if (fd == STDOUT_FILENO) { - cmd = TIOCGETA; - if (cap_ioctls_limit(fd, &cmd, 1) < 0 && errno != ENOSYS) - err(EXIT_FAILURE, "unable to limit ioctls for stdout"); + if (caph_limit_stdout() == -1) + err(EXIT_FAILURE, "unable to limit stdout"); + } else { + cap_rights_init(&rights, CAP_WRITE, CAP_FSTAT); + if (cap_rights_limit(fd, &rights) < 0 && errno != ENOSYS) + err(EXIT_FAILURE, "unable to limit rights"); } if ((p = malloc(sizeof(LIST))) == NULL) err(1, "malloc"); p->fd = fd; p->name = name; p->next = head; head = p; } Index: user/alc/PQ_LAUNDRY/usr.bin/tr/tr.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/tr/tr.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.bin/tr/tr.c (revision 306832) @@ -1,400 +1,382 @@ /* * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1988, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif #ifndef lint static const char sccsid[] = "@(#)tr.c 8.2 (Berkeley) 5/4/95"; #endif #include #include +#include #include #include -#include #include #include #include #include #include #include -#include #include #include #include #include "cmap.h" #include "cset.h" #include "extern.h" static STR s1 = { STRING1, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL }; static STR s2 = { STRING2, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL }; static struct cset *setup(char *, STR *, int, int); static void usage(void); int main(int argc, char **argv) { static int carray[NCHARS_SB]; - cap_rights_t rights; - unsigned long cmd; struct cmap *map; struct cset *delete, *squeeze; int n, *p; int Cflag, cflag, dflag, sflag, isstring2; wint_t ch, cnt, lastch; (void)setlocale(LC_ALL, ""); - cap_rights_init(&rights, CAP_FSTAT, CAP_IOCTL, CAP_READ); - if (cap_rights_limit(STDIN_FILENO, &rights) < 0 && errno != ENOSYS) - err(1, "unable to limit rights for stdin"); - cap_rights_init(&rights, CAP_FSTAT, CAP_IOCTL, CAP_WRITE); - if (cap_rights_limit(STDOUT_FILENO, &rights) < 0 && errno != ENOSYS) - err(1, "unable to limit rights for stdout"); - if (cap_rights_limit(STDERR_FILENO, &rights) < 0 && errno != ENOSYS) - err(1, "unable to limit rights for stderr"); - - /* Required for isatty(3). */ - cmd = TIOCGETA; - if (cap_ioctls_limit(STDIN_FILENO, &cmd, 1) < 0 && errno != ENOSYS) - err(1, "unable to limit ioctls for stdin"); - if (cap_ioctls_limit(STDOUT_FILENO, &cmd, 1) < 0 && errno != ENOSYS) - err(1, "unable to limit ioctls for stdout"); - if (cap_ioctls_limit(STDERR_FILENO, &cmd, 1) < 0 && errno != ENOSYS) - err(1, "unable to limit ioctls for stderr"); + if (caph_limit_stdio() == -1) + err(1, "unable to limit stdio"); if (cap_enter() < 0 && errno != ENOSYS) err(1, "unable to enter capability mode"); Cflag = cflag = dflag = sflag = 0; while ((ch = getopt(argc, argv, "Ccdsu")) != -1) switch((char)ch) { case 'C': Cflag = 1; cflag = 0; break; case 'c': cflag = 1; Cflag = 0; break; case 'd': dflag = 1; break; case 's': sflag = 1; break; case 'u': setbuf(stdout, (char *)NULL); break; case '?': default: usage(); } argc -= optind; argv += optind; switch(argc) { case 0: default: usage(); /* NOTREACHED */ case 1: isstring2 = 0; break; case 2: isstring2 = 1; break; } /* * tr -ds [-Cc] string1 string2 * Delete all characters (or complemented characters) in string1. * Squeeze all characters in string2. */ if (dflag && sflag) { if (!isstring2) usage(); delete = setup(argv[0], &s1, cflag, Cflag); squeeze = setup(argv[1], &s2, 0, 0); for (lastch = OOBCH; (ch = getwchar()) != WEOF;) if (!cset_in(delete, ch) && (lastch != ch || !cset_in(squeeze, ch))) { lastch = ch; (void)putwchar(ch); } if (ferror(stdin)) err(1, NULL); exit(0); } /* * tr -d [-Cc] string1 * Delete all characters (or complemented characters) in string1. */ if (dflag) { if (isstring2) usage(); delete = setup(argv[0], &s1, cflag, Cflag); while ((ch = getwchar()) != WEOF) if (!cset_in(delete, ch)) (void)putwchar(ch); if (ferror(stdin)) err(1, NULL); exit(0); } /* * tr -s [-Cc] string1 * Squeeze all characters (or complemented characters) in string1. */ if (sflag && !isstring2) { squeeze = setup(argv[0], &s1, cflag, Cflag); for (lastch = OOBCH; (ch = getwchar()) != WEOF;) if (lastch != ch || !cset_in(squeeze, ch)) { lastch = ch; (void)putwchar(ch); } if (ferror(stdin)) err(1, NULL); exit(0); } /* * tr [-Ccs] string1 string2 * Replace all characters (or complemented characters) in string1 with * the character in the same position in string2. If the -s option is * specified, squeeze all the characters in string2. */ if (!isstring2) usage(); map = cmap_alloc(); if (map == NULL) err(1, NULL); squeeze = cset_alloc(); if (squeeze == NULL) err(1, NULL); s1.str = argv[0]; if (Cflag || cflag) { cmap_default(map, OOBCH); if ((s2.str = strdup(argv[1])) == NULL) errx(1, "strdup(argv[1])"); } else s2.str = argv[1]; if (!next(&s2)) errx(1, "empty string2"); /* * For -s result will contain only those characters defined * as the second characters in each of the toupper or tolower * pairs. */ /* If string2 runs out of characters, use the last one specified. */ while (next(&s1)) { again: if (s1.state == CCLASS_LOWER && s2.state == CCLASS_UPPER && s1.cnt == 1 && s2.cnt == 1) { do { ch = towupper(s1.lastch); cmap_add(map, s1.lastch, ch); if (sflag && iswupper(ch)) cset_add(squeeze, ch); if (!next(&s1)) goto endloop; } while (s1.state == CCLASS_LOWER && s1.cnt > 1); /* skip upper set */ do { if (!next(&s2)) break; } while (s2.state == CCLASS_UPPER && s2.cnt > 1); goto again; } else if (s1.state == CCLASS_UPPER && s2.state == CCLASS_LOWER && s1.cnt == 1 && s2.cnt == 1) { do { ch = towlower(s1.lastch); cmap_add(map, s1.lastch, ch); if (sflag && iswlower(ch)) cset_add(squeeze, ch); if (!next(&s1)) goto endloop; } while (s1.state == CCLASS_UPPER && s1.cnt > 1); /* skip lower set */ do { if (!next(&s2)) break; } while (s2.state == CCLASS_LOWER && s2.cnt > 1); goto again; } else { cmap_add(map, s1.lastch, s2.lastch); if (sflag) cset_add(squeeze, s2.lastch); } (void)next(&s2); } endloop: if (cflag || (Cflag && MB_CUR_MAX > 1)) { /* * This is somewhat tricky: since the character set is * potentially huge, we need to avoid allocating a map * entry for every character. Our strategy is to set the * default mapping to the last character of string #2 * (= the one that gets automatically repeated), then to * add back identity mappings for characters that should * remain unchanged. We don't waste space on identity mappings * for non-characters with the -C option; those are simulated * in the I/O loop. */ s2.str = argv[1]; s2.state = NORMAL; for (cnt = 0; cnt < WINT_MAX; cnt++) { if (Cflag && !iswrune(cnt)) continue; if (cmap_lookup(map, cnt) == OOBCH) { if (next(&s2)) { cmap_add(map, cnt, s2.lastch); if (sflag) cset_add(squeeze, s2.lastch); } } else cmap_add(map, cnt, cnt); if ((s2.state == EOS || s2.state == INFINITE) && cnt >= cmap_max(map)) break; } cmap_default(map, s2.lastch); } else if (Cflag) { for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) { if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt)) *p++ = cnt; else cmap_add(map, cnt, cnt); } n = p - carray; if (Cflag && n > 1) (void)mergesort(carray, n, sizeof(*carray), charcoll); s2.str = argv[1]; s2.state = NORMAL; for (cnt = 0; cnt < n; cnt++) { (void)next(&s2); cmap_add(map, carray[cnt], s2.lastch); /* * Chars taken from s2 can be different this time * due to lack of complex upper/lower processing, * so fill string2 again to not miss some. */ if (sflag) cset_add(squeeze, s2.lastch); } } cset_cache(squeeze); cmap_cache(map); if (sflag) for (lastch = OOBCH; (ch = getwchar()) != WEOF;) { if (!Cflag || iswrune(ch)) ch = cmap_lookup(map, ch); if (lastch != ch || !cset_in(squeeze, ch)) { lastch = ch; (void)putwchar(ch); } } else while ((ch = getwchar()) != WEOF) { if (!Cflag || iswrune(ch)) ch = cmap_lookup(map, ch); (void)putwchar(ch); } if (ferror(stdin)) err(1, NULL); exit (0); } static struct cset * setup(char *arg, STR *str, int cflag, int Cflag) { struct cset *cs; cs = cset_alloc(); if (cs == NULL) err(1, NULL); str->str = arg; while (next(str)) cset_add(cs, str->lastch); if (Cflag) cset_addclass(cs, wctype("rune"), true); if (cflag || Cflag) cset_invert(cs); cset_cache(cs); return (cs); } int charcoll(const void *a, const void *b) { static char sa[2], sb[2]; sa[0] = *(const int *)a; sb[0] = *(const int *)b; return (strcoll(sa, sb)); } static void usage(void) { (void)fprintf(stderr, "%s\n%s\n%s\n%s\n", "usage: tr [-Ccsu] string1 string2", " tr [-Ccu] -d string1", " tr [-Ccu] -s string1", " tr [-Ccu] -ds string1 string2"); exit(1); } Index: user/alc/PQ_LAUNDRY/usr.sbin/arp/arp.4 =================================================================== --- user/alc/PQ_LAUNDRY/usr.sbin/arp/arp.4 (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.sbin/arp/arp.4 (revision 306832) @@ -1,253 +1,251 @@ .\" Copyright (c) 1985, 1986, 1988, 1994 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 4. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)arp4.4 6.5 (Berkeley) 4/18/94 .\" $FreeBSD$ .\" -.Dd October 3, 2016 +.Dd October 7, 2016 .Dt ARP 4 .Os .Sh NAME .Nm arp .Nd Address Resolution Protocol .Sh SYNOPSIS .Cd "device ether" .Sh DESCRIPTION The Address Resolution Protocol (ARP) is used to dynamically map between Protocol Addresses (such as IP addresses) and Local Network Addresses (such as Ethernet addresses). This implementation maps IP addresses to Ethernet, ARCnet, or Token Ring addresses. It is used by all the Ethernet interface drivers. .Pp ARP caches Internet-Ethernet address mappings. When an interface requests a mapping for an address not in the cache, ARP queues the message which requires the mapping and broadcasts a message on the associated network requesting the address mapping. If a response is provided, the new mapping is cached and any pending message is transmitted. ARP will queue at most one packet while waiting for a response to a mapping request; only the most recently ``transmitted'' packet is kept. If the target host does not respond after several requests, the host is considered to be down allowing an error to be returned to transmission attempts. Further demand for this mapping causes ARP request retransmissions, that are ratelimited to one packet per second. The error is .Er EHOSTDOWN for a non-responding destination host, and .Er EHOSTUNREACH for a non-responding router. .Pp The ARP cache is stored in the system routing table as dynamically-created host routes. The route to a directly-attached Ethernet network is installed as a .Dq cloning route (one with the .Li RTF_CLONING flag set), causing routes to individual hosts on that network to be created on demand. These routes time out periodically (normally 20 minutes after validated; entries are not validated when not in use). .Pp ARP entries may be added, deleted or changed with the .Xr arp 8 utility. Manually-added entries may be temporary or permanent, and may be .Dq published , in which case the system will respond to ARP requests for that host as if it were the target of the request. .Pp In the past, ARP was used to negotiate the use of a trailer encapsulation. This is no longer supported. .Pp ARP watches passively for hosts impersonating the local host (i.e., a host which responds to an ARP mapping request for the local host's address). .Pp Proxy ARP is a feature whereby the local host will respond to requests for addresses other than itself, with its own address. Normally, proxy ARP in .Fx is set up on a host-by-host basis using the .Xr arp 8 utility, by adding an entry for each host inside a given subnet for which proxying of ARP requests is desired. However, the .Dq "proxy all" feature causes the local host to act as a proxy for .Em all hosts reachable through some other network interface, different from the one the request came in from. It may be enabled by setting the .Xr sysctl 8 MIB variable .Va net.link.ether.inet.proxyall to 1. .Sh MIB Variables The ARP protocol implements a number of configurable variables in .Va net.link.ether.inet branch of the .Xr sysctl 3 MIB. .Bl -tag -width "log_arp_permanent_modify" .It Va allow_multicast -Should the kernel install ARP entries with multicast bit set in -the hardware address. -Installing such entries is RFC 1812 violation, but some prorietary -load balancing techniques require routers on network to do so. +Install ARP entries with the multicast bit set in the hardware address. +Installing such entries is an RFC 1812 violation, but some proprietary load +balancing techniques require routers to do so. Turned off by default. .It Va garp_rexmit_count -Should the kernel retransmit gratuitous ARP (GARP) packets when an IPv4 address -is added to an interface. +Retransmit gratuitous ARP (GARP) packets when an IPv4 address is added to an +interface. A GARP is always transmitted when an IPv4 address is added to an interface. -A non-zero value of this sysctl will cause the GARP packet to be retransmitted -the stated number of times. +A non-zero value causes the GARP packet to be retransmitted the stated number +of times. The interval between retransmissions is doubled each time, so the retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds). The default value of zero means only the initial GARP is sent; no additional GARP packets are retransmitted. The maximum value is sixteen. .Pp -Although a single GARP packet (the default behavior) is usually sufficient, in -some circumstances, such as when a shared address is passed between cluster -nodes, this single GARP may be dropped or lost. -This can lead to neighbors on the network link working with a stale ARP cache -and sending packets destined for that address to the node that previously owned -the address, which may not respond. +The default behavior of a single GARP packet is usually sufficient. +However, a single GARP might be dropped or lost in some circumstances. +This is particularly harmful when a shared address is passed between cluster +nodes. +Neighbors on the network link might then work with a stale ARP cache and send +packets destined for that address to the node that previously owned the +address, which might not respond. .It Va log_arp_movements -Should the kernel log movements of IP addresses from one hardware -address to an other. +Log movements of IP addresses from one hardware address to another. See .Sx DIAGNOSTICS below. Turned on by default. .It Va log_arp_permanent_modify -Should the kernel log attempts of remote host on network to modify a -permanent ARP entry. +Log attempts by a remote host to modify a permanent ARP entry. See .Sx DIAGNOSTICS below. Turned on by default. .It Va log_arp_wrong_iface -Should the kernel log attempts to insert an ARP entry on an interface -when the IP network the address belongs to is connected to an other -interface. +Log attempts to insert an ARP entry on an interface when the IP network to +which the address belongs is connected to another interface. See .Sx DIAGNOSTICS below. Turned on by default. .It Va max_log_per_second -Limit number of remotely triggered logging events to a configured value -per second. +Limit the number of remotely triggered logging events to a configured value per +second. Default is 1 log message per second. .It Va max_age How long an ARP entry is held in the cache until it needs to be refreshed. Default is 1200 seconds. .It Va maxhold -How many packets hold in the per-entry output queue while the entry +How many packets to hold in the per-entry output queue while the entry is being resolved. Default is one packet. .It Va maxtries -Number of retransmits before host is considered down and error is returned. +Number of retransmits before a host is considered down and an error is +returned. Default is 5 tries. .It Va proxyall -Enables ARP proxying for all hosts on net. +Enables ARP proxying. Turned off by default. .It Va wait Lifetime of an incomplete ARP entry. Default is 20 seconds. .El .Sh DIAGNOSTICS .Bl -diag .It "arp: %x:%x:%x:%x:%x:%x is using my IP address %d.%d.%d.%d on %s!" ARP has discovered another host on the local network which responds to mapping requests for its own Internet address with a different Ethernet address, generally indicating that two hosts are attempting to use the same Internet address. .It "arp: link address is broadcast for IP address %d.%d.%d.%d!" ARP requested information for a host, and received an answer indicating that the host's ethernet address is the ethernet broadcast address. This indicates a misconfigured or broken device. .It "arp: %d.%d.%d.%d moved from %x:%x:%x:%x:%x:%x to %x:%x:%x:%x:%x:%x on %s" ARP had a cached value for the ethernet address of the referenced host, but received a reply indicating that the host is at a new address. This can happen normally when host hardware addresses change, or when a mobile node arrives or leaves the local subnet. It can also indicate a problem with proxy ARP. This message can only be issued if the sysctl .Va net.link.ether.inet.log_arp_movements is set to 1, which is the system's default behaviour. .It "arpresolve: can't allocate llinfo for %d.%d.%d.%d" The route for the referenced host points to a device upon which ARP is required, but ARP was unable to allocate a routing table entry in which to store the host's MAC address. This usually points to a misconfigured routing table. It can also occur if the kernel cannot allocate memory. .It "arp: %d.%d.%d.%d is on if0 but got reply from %x:%x:%x:%x:%x:%x on if1" Physical connections exist to the same logical IP network on both if0 and if1. It can also occur if an entry already exists in the ARP cache for the IP address above, and the cable has been disconnected from if0, then reconnected to if1. This message can only be issued if the sysctl .Va net.link.ether.inet.log_arp_wrong_iface is set to 1, which is the system's default behaviour. .It "arp: %x:%x:%x:%x:%x:%x attempts to modify permanent entry for %d.%d.%d.%d on %s" ARP has received an ARP reply that attempts to overwrite a permanent entry in the local ARP table. This error will only be logged if the sysctl .Va net.link.ether.inet.log_arp_permanent_modify is set to 1, which is the system's default behaviour. .It "arp: %x:%x:%x:%x:%x:%x is multicast" Kernel refused to install an entry with multicast hardware address. If you really want such addresses being installed, set the sysctl .Va net.link.ether.inet.allow_multicast to a positive value. .El .Sh SEE ALSO .Xr inet 4 , .Xr route 4 , .Xr arp 8 , .Xr ifconfig 8 , .Xr route 8 , .Xr sysctl 8 .Rs .%A Plummer, D. .%B "An Ethernet Address Resolution Protocol" .%T RFC826 .Re .Rs .%A Leffler, S.J. .%A Karels, M.J. .%B "Trailer Encapsulations" .%T RFC893 .Re Index: user/alc/PQ_LAUNDRY/usr.sbin/makefs/cd9660.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.sbin/makefs/cd9660.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.sbin/makefs/cd9660.c (revision 306832) @@ -1,2134 +1,2135 @@ /* $NetBSD: cd9660.c,v 1.32 2011/08/23 17:09:11 christos Exp $ */ /* * Copyright (c) 2005 Daniel Watt, Walter Deignan, Ryan Gabrys, Alan * Perez-Rathke and Ram Vedam. All rights reserved. * * This code was written by Daniel Watt, Walter Deignan, Ryan Gabrys, * Alan Perez-Rathke and Ram Vedam. * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY DANIEL WATT, WALTER DEIGNAN, RYAN * GABRYS, ALAN PEREZ-RATHKE AND RAM VEDAM ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL DANIEL WATT, WALTER DEIGNAN, RYAN * GABRYS, ALAN PEREZ-RATHKE AND RAM VEDAM BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE,DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Luke Mewburn for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "makefs.h" #include "cd9660.h" #include "cd9660/iso9660_rrip.h" #include "cd9660/cd9660_archimedes.h" /* * Global variables */ iso9660_disk diskStructure; static void cd9660_finalize_PVD(void); static cd9660node *cd9660_allocate_cd9660node(void); static void cd9660_set_defaults(void); static int cd9660_arguments_set_string(const char *, const char *, int, char, char *); static void cd9660_populate_iso_dir_record( struct _iso_directory_record_cd9660 *, u_char, u_char, u_char, const char *); static void cd9660_setup_root_node(void); static int cd9660_setup_volume_descriptors(void); #if 0 static int cd9660_fill_extended_attribute_record(cd9660node *); #endif static void cd9660_sort_nodes(cd9660node *); static int cd9660_translate_node_common(cd9660node *); static int cd9660_translate_node(fsnode *, cd9660node *); static int cd9660_compare_filename(const char *, const char *); static void cd9660_sorted_child_insert(cd9660node *, cd9660node *); static int cd9660_handle_collisions(cd9660node *, int); static cd9660node *cd9660_rename_filename(cd9660node *, int, int); static void cd9660_copy_filenames(cd9660node *); static void cd9660_sorting_nodes(cd9660node *); static int cd9660_count_collisions(cd9660node *); static cd9660node *cd9660_rrip_move_directory(cd9660node *); static int cd9660_add_dot_records(cd9660node *); static void cd9660_convert_structure(fsnode *, cd9660node *, int, int *, int *); static void cd9660_free_structure(cd9660node *); static int cd9660_generate_path_table(void); static int cd9660_level1_convert_filename(const char *, char *, int); static int cd9660_level2_convert_filename(const char *, char *, int); #if 0 static int cd9660_joliet_convert_filename(const char *, char *, int); #endif static int cd9660_convert_filename(const char *, char *, int); static void cd9660_populate_dot_records(cd9660node *); static int64_t cd9660_compute_offsets(cd9660node *, int64_t); #if 0 static int cd9660_copy_stat_info(cd9660node *, cd9660node *, int); #endif static cd9660node *cd9660_create_virtual_entry(const char *, cd9660node *, int, int); static cd9660node *cd9660_create_file(const char *, cd9660node *, cd9660node *); static cd9660node *cd9660_create_directory(const char *, cd9660node *, cd9660node *); static cd9660node *cd9660_create_special_directory(u_char, cd9660node *); /* * Allocate and initialize a cd9660node * @returns struct cd9660node * Pointer to new node, or NULL on error */ static cd9660node * cd9660_allocate_cd9660node(void) { cd9660node *temp; if ((temp = calloc(1, sizeof(cd9660node))) == NULL) err(EXIT_FAILURE, "%s: calloc", __func__); TAILQ_INIT(&temp->cn_children); temp->parent = temp->dot_record = temp->dot_dot_record = NULL; temp->ptnext = temp->ptprev = temp->ptlast = NULL; temp->node = NULL; temp->isoDirRecord = NULL; temp->isoExtAttributes = NULL; temp->rr_real_parent = temp->rr_relocated = NULL; temp->su_tail_data = NULL; return temp; } int cd9660_defaults_set = 0; /** * Set default values for cd9660 extension to makefs */ static void cd9660_set_defaults(void) { /*Fix the sector size for now, though the spec allows for other sizes*/ diskStructure.sectorSize = 2048; /* Set up defaults in our own structure */ diskStructure.verbose_level = 0; diskStructure.keep_bad_images = 0; diskStructure.follow_sym_links = 0; diskStructure.isoLevel = 2; diskStructure.rock_ridge_enabled = 0; diskStructure.rock_ridge_renamed_dir_name = 0; diskStructure.rock_ridge_move_count = 0; diskStructure.rr_moved_dir = 0; diskStructure.archimedes_enabled = 0; diskStructure.chrp_boot = 0; diskStructure.include_padding_areas = 1; /* Spec breaking functionality */ diskStructure.allow_deep_trees = diskStructure.allow_start_dot = diskStructure.allow_max_name = diskStructure.allow_illegal_chars = diskStructure.allow_lowercase = diskStructure.allow_multidot = diskStructure.omit_trailing_period = 0; /* Make sure the PVD is clear */ memset(&diskStructure.primaryDescriptor, 0, 2048); memset(diskStructure.primaryDescriptor.publisher_id, 0x20,128); memset(diskStructure.primaryDescriptor.preparer_id, 0x20,128); memset(diskStructure.primaryDescriptor.application_id, 0x20,128); memset(diskStructure.primaryDescriptor.copyright_file_id, 0x20,37); memset(diskStructure.primaryDescriptor.abstract_file_id, 0x20,37); memset(diskStructure.primaryDescriptor.bibliographic_file_id, 0x20,37); strcpy(diskStructure.primaryDescriptor.system_id, "FreeBSD"); cd9660_defaults_set = 1; /* Boot support: Initially disabled */ diskStructure.has_generic_bootimage = 0; diskStructure.generic_bootimage = NULL; diskStructure.boot_image_directory = 0; /*memset(diskStructure.boot_descriptor, 0, 2048);*/ diskStructure.is_bootable = 0; TAILQ_INIT(&diskStructure.boot_images); LIST_INIT(&diskStructure.boot_entries); } void cd9660_prep_opts(fsinfo_t *fsopts __unused) { cd9660_set_defaults(); } void cd9660_cleanup_opts(fsinfo_t *fsopts __unused) { } static int cd9660_arguments_set_string(const char *val, const char *fieldtitle, int length, char testmode, char * dest) { int len, test; if (val == NULL) warnx("error: The %s requires a string argument", fieldtitle); else if ((len = strlen(val)) <= length) { if (testmode == 'd') test = cd9660_valid_d_chars(val); else test = cd9660_valid_a_chars(val); if (test) { memcpy(dest, val, len); if (test == 2) cd9660_uppercase_characters(dest, len); return 1; } else warnx("error: The %s must be composed of " "%c-characters", fieldtitle, testmode); } else warnx("error: The %s must be at most 32 characters long", fieldtitle); return 0; } /* * Command-line parsing function */ int cd9660_parse_opts(const char *option, fsinfo_t *fsopts) { char *var, *val; int rv; /* Set up allowed options - integer options ONLY */ option_t cd9660_options[] = { { "l", &diskStructure.isoLevel, 1, 2, "ISO Level" }, { "isolevel", &diskStructure.isoLevel, 1, 2, "ISO Level" }, { "verbose", &diskStructure.verbose_level, 0, 2, "Turns on verbose output" }, { "v", &diskStructure.verbose_level, 0 , 2, "Turns on verbose output"}, { .name = NULL } }; if (cd9660_defaults_set == 0) cd9660_set_defaults(); /* * Todo : finish implementing this, and make a function that * parses them */ /* string_option_t cd9660_string_options[] = { { "L", "Label", &diskStructure.primaryDescriptor.volume_id, 1, 32, "Disk Label", ISO_STRING_FILTER_DCHARS }, { NULL } } */ assert(option != NULL); if (debug & DEBUG_FS_PARSE_OPTS) printf("cd9660_parse_opts: got `%s'\n", option); if ((var = strdup(option)) == NULL) err(1, "allocating memory for copy of option string"); rv = 1; val = strchr(var, '='); if (val != NULL) *val++ = '\0'; /* First handle options with no parameters */ if (strcmp(var, "h") == 0) { diskStructure.displayHelp = 1; rv = 1; } else if (CD9660_IS_COMMAND_ARG_DUAL(var, "S", "follow-symlinks")) { /* this is not handled yet */ diskStructure.follow_sym_links = 1; rv = 1; } else if (CD9660_IS_COMMAND_ARG_DUAL(var, "L", "label")) { rv = cd9660_arguments_set_string(val, "Disk Label", 32, 'd', diskStructure.primaryDescriptor.volume_id); } else if (CD9660_IS_COMMAND_ARG_DUAL(var, "A", "applicationid")) { rv = cd9660_arguments_set_string(val, "Application Identifier", 128, 'a', diskStructure.primaryDescriptor.application_id); } else if(CD9660_IS_COMMAND_ARG_DUAL(var, "P", "publisher")) { rv = cd9660_arguments_set_string(val, "Publisher Identifier", 128, 'a', diskStructure.primaryDescriptor.publisher_id); } else if (CD9660_IS_COMMAND_ARG_DUAL(var, "p", "preparer")) { rv = cd9660_arguments_set_string(val, "Preparer Identifier", 128, 'a', diskStructure.primaryDescriptor.preparer_id); } else if (CD9660_IS_COMMAND_ARG_DUAL(var, "V", "volumeid")) { rv = cd9660_arguments_set_string(val, "Volume Set Identifier", 128, 'a', diskStructure.primaryDescriptor.volume_set_id); /* Boot options */ } else if (CD9660_IS_COMMAND_ARG_DUAL(var, "B", "bootimage")) { if (val == NULL) warnx("error: The Boot Image parameter requires a valid boot information string"); else rv = cd9660_add_boot_disk(val); } else if (CD9660_IS_COMMAND_ARG(var, "bootimagedir")) { /* * XXXfvdl this is unused. */ if (val == NULL) errx(1, "error: The Boot Image Directory parameter" " requires a directory name\n"); else { if ((diskStructure.boot_image_directory = malloc(strlen(val) + 1)) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_parse_opts"); exit(1); } /* BIG TODO: Add the max length function here */ cd9660_arguments_set_string(val, "Boot Image Directory", 12 , 'd', diskStructure.boot_image_directory); } } else if (CD9660_IS_COMMAND_ARG_DUAL(var, "G", "generic-bootimage")) { if (val == NULL) warnx("error: The Boot Image parameter requires a valid boot information string"); else rv = cd9660_add_generic_bootimage(val); } else if (CD9660_IS_COMMAND_ARG(var, "no-trailing-padding")) diskStructure.include_padding_areas = 0; /* RRIP */ else if (CD9660_IS_COMMAND_ARG_DUAL(var, "R", "rockridge")) diskStructure.rock_ridge_enabled = 1; else if (CD9660_IS_COMMAND_ARG_DUAL(var, "A", "archimedes")) diskStructure.archimedes_enabled = 1; else if (CD9660_IS_COMMAND_ARG(var, "chrp-boot")) diskStructure.chrp_boot = 1; else if (CD9660_IS_COMMAND_ARG_DUAL(var, "K", "keep-bad-images")) diskStructure.keep_bad_images = 1; else if (CD9660_IS_COMMAND_ARG(var, "allow-deep-trees")) diskStructure.allow_deep_trees = 1; else if (CD9660_IS_COMMAND_ARG(var, "allow-max-name")) diskStructure.allow_max_name = 1; else if (CD9660_IS_COMMAND_ARG(var, "allow-illegal-chars")) diskStructure.allow_illegal_chars = 1; else if (CD9660_IS_COMMAND_ARG(var, "allow-lowercase")) diskStructure.allow_lowercase = 1; else if (CD9660_IS_COMMAND_ARG(var,"allow-multidot")) diskStructure.allow_multidot = 1; else if (CD9660_IS_COMMAND_ARG(var, "omit-trailing-period")) diskStructure.omit_trailing_period = 1; else if (CD9660_IS_COMMAND_ARG(var, "no-emul-boot") || CD9660_IS_COMMAND_ARG(var, "no-boot") || CD9660_IS_COMMAND_ARG(var, "hard-disk-boot")) { cd9660_eltorito_add_boot_option(var, 0); /* End of flag variables */ } else if (CD9660_IS_COMMAND_ARG(var, "boot-load-segment")) { if (val == NULL) { warnx("Option `%s' doesn't contain a value", var); rv = 0; } else { cd9660_eltorito_add_boot_option(var, val); } } else { if (val == NULL) { warnx("Option `%s' doesn't contain a value", var); rv = 0; } else rv = set_option(cd9660_options, var, val); } free(var); return (rv); } /* * Main function for cd9660_makefs * Builds the ISO image file * @param const char *image The image filename to create * @param const char *dir The directory that is being read * @param struct fsnode *root The root node of the filesystem tree * @param struct fsinfo_t *fsopts Any options */ void cd9660_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) { int64_t startoffset; int numDirectories; uint64_t pathTableSectors; int64_t firstAvailableSector; int64_t totalSpace; int error; cd9660node *real_root; if (diskStructure.verbose_level > 0) printf("cd9660_makefs: ISO level is %i\n", diskStructure.isoLevel); if (diskStructure.isoLevel < 2 && diskStructure.allow_multidot) errx(1, "allow-multidot requires iso level of 2\n"); assert(image != NULL); assert(dir != NULL); assert(root != NULL); if (diskStructure.displayHelp) { /* * Display help here - probably want to put it in * a separate function */ return; } if (diskStructure.verbose_level > 0) printf("cd9660_makefs: image %s directory %s root %p\n", image, dir, root); /* Set up some constants. Later, these will be defined with options */ /* Counter needed for path tables */ numDirectories = 0; /* Convert tree to our own format */ /* Actually, we now need to add the REAL root node, at level 0 */ real_root = cd9660_allocate_cd9660node(); if ((real_root->isoDirRecord = malloc( sizeof(iso_directory_record_cd9660) )) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_makefs"); exit(1); } /* Leave filename blank for root */ memset(real_root->isoDirRecord->name, 0, ISO_FILENAME_MAXLENGTH_WITH_PADDING); real_root->level = 0; diskStructure.rootNode = real_root; real_root->type = CD9660_TYPE_DIR; error = 0; real_root->node = root; cd9660_convert_structure(root, real_root, 1, &numDirectories, &error); if (TAILQ_EMPTY(&real_root->cn_children)) { errx(1, "cd9660_makefs: converted directory is empty. " "Tree conversion failed\n"); } else if (error != 0) { errx(1, "cd9660_makefs: tree conversion failed\n"); } else { if (diskStructure.verbose_level > 0) printf("cd9660_makefs: tree converted\n"); } /* Add the dot and dot dot records */ cd9660_add_dot_records(real_root); cd9660_setup_root_node(); if (diskStructure.verbose_level > 0) printf("cd9660_makefs: done converting tree\n"); /* non-SUSP extensions */ if (diskStructure.archimedes_enabled) archimedes_convert_tree(diskStructure.rootNode); /* Rock ridge / SUSP init pass */ if (diskStructure.rock_ridge_enabled) { cd9660_susp_initialize(diskStructure.rootNode, diskStructure.rootNode, NULL); } /* Build path table structure */ diskStructure.pathTableLength = cd9660_generate_path_table(); pathTableSectors = CD9660_BLOCKS(diskStructure.sectorSize, diskStructure.pathTableLength); firstAvailableSector = cd9660_setup_volume_descriptors(); if (diskStructure.is_bootable) { firstAvailableSector = cd9660_setup_boot(firstAvailableSector); if (firstAvailableSector < 0) errx(1, "setup_boot failed"); } /* LE first, then BE */ diskStructure.primaryLittleEndianTableSector = firstAvailableSector; diskStructure.primaryBigEndianTableSector = diskStructure.primaryLittleEndianTableSector + pathTableSectors; /* Set the secondary ones to -1, not going to use them for now */ diskStructure.secondaryBigEndianTableSector = -1; diskStructure.secondaryLittleEndianTableSector = -1; diskStructure.dataFirstSector = diskStructure.primaryBigEndianTableSector + pathTableSectors; if (diskStructure.verbose_level > 0) printf("cd9660_makefs: Path table conversion complete. " "Each table is %i bytes, or %" PRIu64 " sectors.\n", diskStructure.pathTableLength, pathTableSectors); startoffset = diskStructure.sectorSize*diskStructure.dataFirstSector; totalSpace = cd9660_compute_offsets(real_root, startoffset); diskStructure.totalSectors = diskStructure.dataFirstSector + CD9660_BLOCKS(diskStructure.sectorSize, totalSpace); /* Disabled until pass 1 is done */ if (diskStructure.rock_ridge_enabled) { diskStructure.susp_continuation_area_start_sector = diskStructure.totalSectors; diskStructure.totalSectors += CD9660_BLOCKS(diskStructure.sectorSize, diskStructure.susp_continuation_area_size); cd9660_susp_finalize(diskStructure.rootNode); } cd9660_finalize_PVD(); /* Add padding sectors, just for testing purposes right now */ /* diskStructure.totalSectors+=150; */ /* Debugging output */ if (diskStructure.verbose_level > 0) { printf("cd9660_makefs: Sectors 0-15 reserved\n"); printf("cd9660_makefs: Primary path tables starts in sector %" PRId64 "\n", diskStructure.primaryLittleEndianTableSector); printf("cd9660_makefs: File data starts in sector %" PRId64 "\n", diskStructure.dataFirstSector); printf("cd9660_makefs: Total sectors: %" PRId64 "\n", diskStructure.totalSectors); } /* * Add padding sectors at the end * TODO: Clean this up and separate padding */ if (diskStructure.include_padding_areas) diskStructure.totalSectors += 150; cd9660_write_image(image); if (diskStructure.verbose_level > 1) { debug_print_volume_descriptor_information(); debug_print_tree(real_root,0); debug_print_path_tree(real_root); } /* Clean up data structures */ cd9660_free_structure(real_root); if (diskStructure.verbose_level > 0) printf("cd9660_makefs: done\n"); } /* Generic function pointer - implement later */ typedef int (*cd9660node_func)(cd9660node *); static void cd9660_finalize_PVD(void) { time_t tstamp = stampst.st_ino ? stampst.st_mtime : time(NULL); /* root should be a fixed size of 34 bytes since it has no name */ memcpy(diskStructure.primaryDescriptor.root_directory_record, diskStructure.rootNode->dot_record->isoDirRecord, 34); /* In RRIP, this might be longer than 34 */ diskStructure.primaryDescriptor.root_directory_record[0] = 34; /* Set up all the important numbers in the PVD */ cd9660_bothendian_dword(diskStructure.totalSectors, (unsigned char *)diskStructure.primaryDescriptor.volume_space_size); cd9660_bothendian_word(1, (unsigned char *)diskStructure.primaryDescriptor.volume_set_size); cd9660_bothendian_word(1, (unsigned char *) diskStructure.primaryDescriptor.volume_sequence_number); cd9660_bothendian_word(diskStructure.sectorSize, (unsigned char *) diskStructure.primaryDescriptor.logical_block_size); cd9660_bothendian_dword(diskStructure.pathTableLength, (unsigned char *)diskStructure.primaryDescriptor.path_table_size); cd9660_731(diskStructure.primaryLittleEndianTableSector, (u_char *)diskStructure.primaryDescriptor.type_l_path_table); cd9660_732(diskStructure.primaryBigEndianTableSector, (u_char *)diskStructure.primaryDescriptor.type_m_path_table); diskStructure.primaryDescriptor.file_structure_version[0] = 1; /* Pad all strings with spaces instead of nulls */ cd9660_pad_string_spaces(diskStructure.primaryDescriptor.volume_id, 32); cd9660_pad_string_spaces(diskStructure.primaryDescriptor.system_id, 32); cd9660_pad_string_spaces(diskStructure.primaryDescriptor.volume_set_id, 128); cd9660_pad_string_spaces(diskStructure.primaryDescriptor.publisher_id, 128); cd9660_pad_string_spaces(diskStructure.primaryDescriptor.preparer_id, 128); cd9660_pad_string_spaces(diskStructure.primaryDescriptor.application_id, 128); cd9660_pad_string_spaces( diskStructure.primaryDescriptor.copyright_file_id, 37); cd9660_pad_string_spaces( diskStructure.primaryDescriptor.abstract_file_id, 37); cd9660_pad_string_spaces( diskStructure.primaryDescriptor.bibliographic_file_id, 37); /* Setup dates */ cd9660_time_8426( (unsigned char *)diskStructure.primaryDescriptor.creation_date, tstamp); cd9660_time_8426( (unsigned char *)diskStructure.primaryDescriptor.modification_date, tstamp); #if 0 cd9660_set_date(diskStructure.primaryDescriptor.expiration_date, tstamp); #endif memset(diskStructure.primaryDescriptor.expiration_date, '0', 16); diskStructure.primaryDescriptor.expiration_date[16] = 0; cd9660_time_8426( (unsigned char *)diskStructure.primaryDescriptor.effective_date, tstamp); /* make this sane */ cd9660_time_915(diskStructure.rootNode->dot_record->isoDirRecord->date, tstamp); } static void cd9660_populate_iso_dir_record(struct _iso_directory_record_cd9660 *record, u_char ext_attr_length, u_char flags, u_char name_len, const char * name) { record->ext_attr_length[0] = ext_attr_length; record->flags[0] = ISO_FLAG_CLEAR | flags; record->file_unit_size[0] = 0; record->interleave[0] = 0; cd9660_bothendian_word(1, record->volume_sequence_number); record->name_len[0] = name_len; memset(record->name, '\0', sizeof (record->name)); memcpy(record->name, name, name_len); record->length[0] = 33 + name_len; /* Todo : better rounding */ record->length[0] += (record->length[0] & 1) ? 1 : 0; } static void cd9660_setup_root_node(void) { cd9660_populate_iso_dir_record(diskStructure.rootNode->isoDirRecord, 0, ISO_FLAG_DIRECTORY, 1, "\0"); } /*********** SUPPORT FUNCTIONS ***********/ static int cd9660_setup_volume_descriptors(void) { /* Boot volume descriptor should come second */ int sector = 16; /* For now, a fixed 2 : PVD and terminator */ volume_descriptor *temp, *t; /* Set up the PVD */ if ((temp = malloc(sizeof(volume_descriptor))) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_setup_volume_descriptors"); exit(1); } temp->volumeDescriptorData = (unsigned char *)&diskStructure.primaryDescriptor; temp->volumeDescriptorData[0] = ISO_VOLUME_DESCRIPTOR_PVD; temp->volumeDescriptorData[6] = 1; temp->sector = sector; memcpy(temp->volumeDescriptorData + 1, ISO_VOLUME_DESCRIPTOR_STANDARD_ID, 5); diskStructure.firstVolumeDescriptor = temp; sector++; /* Set up boot support if enabled. BVD must reside in sector 17 */ if (diskStructure.is_bootable) { if ((t = malloc(sizeof(volume_descriptor))) == NULL) { CD9660_MEM_ALLOC_ERROR( "cd9660_setup_volume_descriptors"); exit(1); } if ((t->volumeDescriptorData = malloc(2048)) == NULL) { CD9660_MEM_ALLOC_ERROR( "cd9660_setup_volume_descriptors"); exit(1); } temp->next = t; temp = t; memset(t->volumeDescriptorData, 0, 2048); t->sector = 17; if (diskStructure.verbose_level > 0) printf("Setting up boot volume descriptor\n"); cd9660_setup_boot_volume_descriptor(t); sector++; } /* Set up the terminator */ if ((t = malloc(sizeof(volume_descriptor))) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_setup_volume_descriptors"); exit(1); } if ((t->volumeDescriptorData = malloc(2048)) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_setup_volume_descriptors"); exit(1); } temp->next = t; memset(t->volumeDescriptorData, 0, 2048); t->volumeDescriptorData[0] = ISO_VOLUME_DESCRIPTOR_TERMINATOR; t->next = NULL; t->volumeDescriptorData[6] = 1; t->sector = sector; memcpy(t->volumeDescriptorData + 1, ISO_VOLUME_DESCRIPTOR_STANDARD_ID, 5); sector++; return sector; } #if 0 /* * Populate EAR at some point. Not required, but is used by NetBSD's * cd9660 support */ static int cd9660_fill_extended_attribute_record(cd9660node *node) { if ((node->isoExtAttributes = malloc(sizeof(struct iso_extended_attributes))) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_fill_extended_attribute_record"); exit(1); }; return 1; } #endif static int cd9660_translate_node_common(cd9660node *newnode) { time_t tstamp = stampst.st_ino ? stampst.st_mtime : time(NULL); int test; u_char flag; char temp[ISO_FILENAME_MAXLENGTH_WITH_PADDING]; /* Now populate the isoDirRecord structure */ memset(temp, 0, ISO_FILENAME_MAXLENGTH_WITH_PADDING); test = cd9660_convert_filename(newnode->node->name, temp, !(S_ISDIR(newnode->node->type))); flag = ISO_FLAG_CLEAR; if (S_ISDIR(newnode->node->type)) flag |= ISO_FLAG_DIRECTORY; cd9660_populate_iso_dir_record(newnode->isoDirRecord, 0, flag, strlen(temp), temp); /* Set the various dates */ /* If we want to use the current date and time */ cd9660_time_915(newnode->isoDirRecord->date, tstamp); cd9660_bothendian_dword(newnode->fileDataLength, newnode->isoDirRecord->size); /* If the file is a link, we want to set the size to 0 */ if (S_ISLNK(newnode->node->type)) newnode->fileDataLength = 0; return 1; } /* * Translate fsnode to cd9660node * Translate filenames and other metadata, including dates, sizes, * permissions, etc * @param struct fsnode * The node generated by makefs * @param struct cd9660node * The intermediate node to be written to * @returns int 0 on failure, 1 on success */ static int cd9660_translate_node(fsnode *node, cd9660node *newnode) { if (node == NULL) { if (diskStructure.verbose_level > 0) printf("cd9660_translate_node: NULL node passed, " "returning\n"); return 0; } if ((newnode->isoDirRecord = malloc(sizeof(iso_directory_record_cd9660))) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_translate_node"); return 0; } /* Set the node pointer */ newnode->node = node; /* Set the size */ if (!(S_ISDIR(node->type))) newnode->fileDataLength = node->inode->st.st_size; if (cd9660_translate_node_common(newnode) == 0) return 0; /* Finally, overwrite some of the values that are set by default */ cd9660_time_915(newnode->isoDirRecord->date, stampst.st_ino ? stampst.st_mtime : node->inode->st.st_mtime); return 1; } /* * Compares two ISO filenames * @param const char * The first file name * @param const char * The second file name * @returns : -1 if first is less than second, 0 if they are the same, 1 if * the second is greater than the first */ static int cd9660_compare_filename(const char *first, const char *second) { /* * This can be made more optimal once it has been tested * (the extra character, for example, is for testing) */ int p1 = 0; int p2 = 0; char c1, c2; /* First, on the filename */ while (p1 < ISO_FILENAME_MAXLENGTH_BEFORE_VERSION-1 && p2 < ISO_FILENAME_MAXLENGTH_BEFORE_VERSION-1) { c1 = first[p1]; c2 = second[p2]; if (c1 == '.' && c2 =='.') break; else if (c1 == '.') { p2++; c1 = ' '; } else if (c2 == '.') { p1++; c2 = ' '; } else { p1++; p2++; } if (c1 < c2) return -1; else if (c1 > c2) { return 1; } } if (first[p1] == '.' && second[p2] == '.') { p1++; p2++; while (p1 < ISO_FILENAME_MAXLENGTH_BEFORE_VERSION - 1 && p2 < ISO_FILENAME_MAXLENGTH_BEFORE_VERSION - 1) { c1 = first[p1]; c2 = second[p2]; if (c1 == ';' && c2 == ';') break; else if (c1 == ';') { p2++; c1 = ' '; } else if (c2 == ';') { p1++; c2 = ' '; } else { p1++; p2++; } if (c1 < c2) return -1; else if (c1 > c2) return 1; } } return 0; } /* * Insert a node into list with ISO sorting rules * @param cd9660node * The head node of the list * @param cd9660node * The node to be inserted */ static void cd9660_sorted_child_insert(cd9660node *parent, cd9660node *cn_new) { int compare; cd9660node *cn; struct cd9660_children_head *head = &parent->cn_children; /* TODO: Optimize? */ cn_new->parent = parent; /* * first will either be 0, the . or the .. * if . or .., this means no other entry may be written before first * if 0, the new node may be inserted at the head */ TAILQ_FOREACH(cn, head, cn_next_child) { /* * Dont insert a node twice - * that would cause an infinite loop */ if (cn_new == cn) return; compare = cd9660_compare_filename(cn_new->isoDirRecord->name, cn->isoDirRecord->name); if (compare == 0) compare = cd9660_compare_filename(cn_new->node->name, cn->node->name); if (compare < 0) break; } if (cn == NULL) TAILQ_INSERT_TAIL(head, cn_new, cn_next_child); else TAILQ_INSERT_BEFORE(cn, cn_new, cn_next_child); } /* * Called After cd9660_sorted_child_insert * handles file collisions by suffixing each filname with ~n * where n represents the files respective place in the ordering */ static int cd9660_handle_collisions(cd9660node *colliding, int past) { cd9660node *iter, *next, *prev; int skip; int delete_chars = 0; int temp_past = past; int temp_skip; int flag = 0; cd9660node *end_of_range; for (iter = TAILQ_FIRST(&colliding->cn_children); iter != NULL && (next = TAILQ_NEXT(iter, cn_next_child)) != NULL;) { if (strcmp(iter->isoDirRecord->name, next->isoDirRecord->name) != 0) { iter = TAILQ_NEXT(iter, cn_next_child); continue; } flag = 1; temp_skip = skip = cd9660_count_collisions(iter); end_of_range = iter; while (temp_skip > 0) { temp_skip--; end_of_range = TAILQ_NEXT(end_of_range, cn_next_child); } temp_past = past; while (temp_past > 0) { if ((next = TAILQ_NEXT(end_of_range, cn_next_child)) != NULL) end_of_range = next; else if ((prev = TAILQ_PREV(iter, cd9660_children_head, cn_next_child)) != NULL) iter = prev; else delete_chars++; temp_past--; } skip += past; iter = cd9660_rename_filename(iter, skip, delete_chars); } return flag; } static cd9660node * cd9660_rename_filename(cd9660node *iter, int num, int delete_chars) { int i = 0; int numbts, digit, digits, temp, powers, count; char *naming; int maxlength; char *tmp; if (diskStructure.verbose_level > 0) printf("Rename_filename called\n"); assert(1 <= diskStructure.isoLevel && diskStructure.isoLevel <= 2); /* TODO : A LOT of chanes regarding 8.3 filenames */ if (diskStructure.isoLevel == 1) maxlength = 8; else if (diskStructure.isoLevel == 2) maxlength = 31; else maxlength = ISO_FILENAME_MAXLENGTH_BEFORE_VERSION; tmp = malloc(ISO_FILENAME_MAXLENGTH_WITH_PADDING); while (i < num) { powers = 1; count = 0; digits = 1; while (((int)(i / powers) ) >= 10) { digits++; powers = powers * 10; } naming = iter->o_name; /* while ((*naming != '.') && (*naming != ';')) { naming++; count++; } */ while (count < maxlength) { if (*naming == ';') break; naming++; count++; } if ((count + digits) < maxlength) numbts = count; else numbts = maxlength - (digits); numbts -= delete_chars; /* 8.3 rules - keep the extension, add before the dot */ /* * This code makes a bunch of assumptions. * See if you can spot them all :) */ /* if (diskStructure.isoLevel == 1) { numbts = 8 - digits - delete_chars; if (dot < 0) { } else { if (dot < 8) { memmove(&tmp[numbts],&tmp[dot],4); } } } */ /* (copying just the filename before the '.' */ memcpy(tmp, (iter->o_name), numbts); /* adding the appropriate number following the name */ temp = i; while (digits > 0) { digit = (int)(temp / powers); temp = temp - digit * powers; sprintf(&tmp[numbts] , "%d", digit); digits--; numbts++; powers = powers / 10; } while ((*naming != ';') && (numbts < maxlength)) { tmp[numbts] = (*naming); naming++; numbts++; } tmp[numbts] = ';'; tmp[numbts+1] = '1'; tmp[numbts+2] = '\0'; /* * now tmp has exactly the identifier * we want so we'll copy it back to record */ memcpy((iter->isoDirRecord->name), tmp, numbts + 3); iter = TAILQ_NEXT(iter, cn_next_child); i++; } free(tmp); return iter; } /* Todo: Figure out why these functions are nec. */ static void cd9660_copy_filenames(cd9660node *node) { cd9660node *cn; if (TAILQ_EMPTY(&node->cn_children)) return; if (TAILQ_FIRST(&node->cn_children)->isoDirRecord == NULL) { debug_print_tree(diskStructure.rootNode, 0); exit(1); } TAILQ_FOREACH(cn, &node->cn_children, cn_next_child) { cd9660_copy_filenames(cn); memcpy(cn->o_name, cn->isoDirRecord->name, ISO_FILENAME_MAXLENGTH_WITH_PADDING); } } static void cd9660_sorting_nodes(cd9660node *node) { cd9660node *cn; TAILQ_FOREACH(cn, &node->cn_children, cn_next_child) cd9660_sorting_nodes(cn); cd9660_sort_nodes(node); } /* XXX Bubble sort. */ static void cd9660_sort_nodes(cd9660node *node) { cd9660node *cn, *next; do { TAILQ_FOREACH(cn, &node->cn_children, cn_next_child) { if ((next = TAILQ_NEXT(cn, cn_next_child)) == NULL) return; else if (strcmp(next->isoDirRecord->name, cn->isoDirRecord->name) >= 0) continue; TAILQ_REMOVE(&node->cn_children, next, cn_next_child); TAILQ_INSERT_BEFORE(cn, next, cn_next_child); break; } } while (cn != NULL); } static int cd9660_count_collisions(cd9660node *copy) { int count = 0; cd9660node *iter, *next; for (iter = copy; (next = TAILQ_NEXT(iter, cn_next_child)) != NULL; iter = next) { if (cd9660_compare_filename(iter->isoDirRecord->name, next->isoDirRecord->name) == 0) count++; else return count; } #if 0 if ((next = TAILQ_NEXT(iter, cn_next_child)) != NULL) { printf("cd9660_recurse_on_collision: count is %i \n", count); compare = cd9660_compare_filename(iter->isoDirRecord->name, next->isoDirRecord->name); if (compare == 0) { count++; return cd9660_recurse_on_collision(next, count); } else return count; } #endif return count; } static cd9660node * cd9660_rrip_move_directory(cd9660node *dir) { char newname[9]; cd9660node *tfile; /* * This function needs to: * 1) Create an empty virtual file in place of the old directory * 2) Point the virtual file to the new directory * 3) Point the relocated directory to its old parent * 4) Move the directory specified by dir into rr_moved_dir, * and rename it to "diskStructure.rock_ridge_move_count" (as a string) */ /* First see if the moved directory even exists */ if (diskStructure.rr_moved_dir == NULL) { diskStructure.rr_moved_dir = cd9660_create_directory(ISO_RRIP_DEFAULT_MOVE_DIR_NAME, diskStructure.rootNode, dir); if (diskStructure.rr_moved_dir == NULL) return 0; cd9660_time_915(diskStructure.rr_moved_dir->isoDirRecord->date, stampst.st_ino ? stampst.st_mtime : start_time.tv_sec); } /* Create a file with the same ORIGINAL name */ tfile = cd9660_create_file(dir->node->name, dir->parent, dir); if (tfile == NULL) return NULL; diskStructure.rock_ridge_move_count++; snprintf(newname, sizeof(newname), "%08i", diskStructure.rock_ridge_move_count); /* Point to old parent */ dir->rr_real_parent = dir->parent; /* Place the placeholder file */ if (TAILQ_EMPTY(&dir->rr_real_parent->cn_children)) { TAILQ_INSERT_HEAD(&dir->rr_real_parent->cn_children, tfile, cn_next_child); } else { cd9660_sorted_child_insert(dir->rr_real_parent, tfile); } /* Point to new parent */ dir->parent = diskStructure.rr_moved_dir; /* Point the file to the moved directory */ tfile->rr_relocated = dir; /* Actually move the directory */ cd9660_sorted_child_insert(diskStructure.rr_moved_dir, dir); /* TODO: Inherit permissions / ownership (basically the entire inode) */ /* Set the new name */ memset(dir->isoDirRecord->name, 0, ISO_FILENAME_MAXLENGTH_WITH_PADDING); strncpy(dir->isoDirRecord->name, newname, 8); dir->isoDirRecord->length[0] = 34 + 8; dir->isoDirRecord->name_len[0] = 8; return dir; } static int cd9660_add_dot_records(cd9660node *root) { struct cd9660_children_head *head = &root->cn_children; cd9660node *cn; TAILQ_FOREACH(cn, head, cn_next_child) { if ((cn->type & CD9660_TYPE_DIR) == 0) continue; /* Recursion first */ cd9660_add_dot_records(cn); } cd9660_create_special_directory(CD9660_TYPE_DOT, root); cd9660_create_special_directory(CD9660_TYPE_DOTDOT, root); return 1; } /* * Convert node to cd9660 structure * This function is designed to be called recursively on the root node of * the filesystem * Lots of recursion going on here, want to make sure it is efficient * @param struct fsnode * The root node to be converted * @param struct cd9660* The parent node (should not be NULL) * @param int Current directory depth * @param int* Running count of the number of directories that are being created */ static void cd9660_convert_structure(fsnode *root, cd9660node *parent_node, int level, int *numDirectories, int *error) { fsnode *iterator = root; cd9660node *this_node; int working_level; int add; int flag = 0; int counter = 0; /* * Newer, more efficient method, reduces recursion depth */ if (root == NULL) { warnx("%s: root is null\n", __func__); return; } /* Test for an empty directory - makefs still gives us the . record */ if ((S_ISDIR(root->type)) && (root->name[0] == '.') && (root->name[1] == '\0')) { root = root->next; if (root == NULL) return; } if ((this_node = cd9660_allocate_cd9660node()) == NULL) { CD9660_MEM_ALLOC_ERROR(__func__); } /* * To reduce the number of recursive calls, we will iterate over * the next pointers to the right. */ while (iterator != NULL) { add = 1; /* * Increment the directory count if this is a directory * Ignore "." entries. We will generate them later */ if (!S_ISDIR(iterator->type) || strcmp(iterator->name, ".") != 0) { /* Translate the node, including its filename */ this_node->parent = parent_node; cd9660_translate_node(iterator, this_node); this_node->level = level; if (S_ISDIR(iterator->type)) { (*numDirectories)++; this_node->type = CD9660_TYPE_DIR; working_level = level + 1; /* * If at level 8, directory would be at 8 * and have children at 9 which is not * allowed as per ISO spec */ if (level == 8) { if ((!diskStructure.allow_deep_trees) && (!diskStructure.rock_ridge_enabled)) { warnx("error: found entry " "with depth greater " "than 8."); (*error) = 1; return; } else if (diskStructure. rock_ridge_enabled) { working_level = 3; /* * Moved directory is actually * at level 2. */ this_node->level = working_level - 1; if (cd9660_rrip_move_directory( this_node) == NULL) { warnx("Failure in " "cd9660_rrip_" "move_directory" ); (*error) = 1; return; } add = 0; } } /* Do the recursive call on the children */ if (iterator->child != NULL) { cd9660_convert_structure( iterator->child, this_node, working_level, numDirectories, error); if ((*error) == 1) { warnx("%s: Error on recursive " "call", __func__); return; } } } else { /* Only directories should have children */ assert(iterator->child == NULL); this_node->type = CD9660_TYPE_FILE; } /* * Finally, do a sorted insert */ if (add) { cd9660_sorted_child_insert( parent_node, this_node); } /*Allocate new temp_node */ if (iterator->next != NULL) { this_node = cd9660_allocate_cd9660node(); if (this_node == NULL) CD9660_MEM_ALLOC_ERROR(__func__); } } iterator = iterator->next; } /* cd9660_handle_collisions(first_node); */ /* TODO: need cleanup */ cd9660_copy_filenames(parent_node); do { flag = cd9660_handle_collisions(parent_node, counter); counter++; cd9660_sorting_nodes(parent_node); } while ((flag == 1) && (counter < 100)); } /* * Clean up the cd9660node tree * This is designed to be called recursively on the root node * @param struct cd9660node *root The node to free * @returns void */ static void cd9660_free_structure(cd9660node *root) { cd9660node *cn; while ((cn = TAILQ_FIRST(&root->cn_children)) != NULL) { TAILQ_REMOVE(&root->cn_children, cn, cn_next_child); cd9660_free_structure(cn); } free(root); } /* * Be a little more memory conservative: * instead of having the TAILQ_ENTRY as part of the cd9660node, * just create a temporary structure */ struct ptq_entry { TAILQ_ENTRY(ptq_entry) ptq; cd9660node *node; } *n; #define PTQUEUE_NEW(n,s,r,t){\ n = malloc(sizeof(struct s)); \ if (n == NULL) \ return r; \ n->node = t;\ } /* * Generate the path tables * The specific implementation of this function is left as an exercise to the * programmer. It could be done recursively. Make sure you read how the path * table has to be laid out, it has levels. * @param struct iso9660_disk *disk The disk image * @returns int The number of built path tables (between 1 and 4), 0 on failure */ static int cd9660_generate_path_table(void) { cd9660node *cn, *dirNode = diskStructure.rootNode; cd9660node *last = dirNode; int pathTableSize = 0; /* computed as we go */ int counter = 1; /* root gets a count of 0 */ TAILQ_HEAD(cd9660_pt_head, ptq_entry) pt_head; TAILQ_INIT(&pt_head); PTQUEUE_NEW(n, ptq_entry, -1, diskStructure.rootNode); /* Push the root node */ TAILQ_INSERT_HEAD(&pt_head, n, ptq); /* Breadth-first traversal of file structure */ while (pt_head.tqh_first != 0) { n = pt_head.tqh_first; dirNode = n->node; TAILQ_REMOVE(&pt_head, pt_head.tqh_first, ptq); free(n); /* Update the size */ pathTableSize += ISO_PATHTABLE_ENTRY_BASESIZE + dirNode->isoDirRecord->name_len[0]+ (dirNode->isoDirRecord->name_len[0] % 2 == 0 ? 0 : 1); /* includes the padding bit */ dirNode->ptnumber=counter; if (dirNode != last) { last->ptnext = dirNode; dirNode->ptprev = last; } last = dirNode; /* Push children onto queue */ TAILQ_FOREACH(cn, &dirNode->cn_children, cn_next_child) { /* * Dont add the DOT and DOTDOT types to the path * table. */ if ((cn->type != CD9660_TYPE_DOT) && (cn->type != CD9660_TYPE_DOTDOT)) { if (S_ISDIR(cn->node->type)) { PTQUEUE_NEW(n, ptq_entry, -1, cn); TAILQ_INSERT_TAIL(&pt_head, n, ptq); } } } counter++; } return pathTableSize; } void cd9660_compute_full_filename(cd9660node *node, char *buf) { int len; len = CD9660MAXPATH + 1; len = snprintf(buf, len, "%s/%s/%s", node->node->root, node->node->path, node->node->name); if (len > CD9660MAXPATH) errx(1, "Pathname too long."); } /* NEW filename conversion method */ typedef int(*cd9660_filename_conversion_functor)(const char *, char *, int); /* * TODO: These two functions are almost identical. * Some code cleanup is possible here * * XXX bounds checking! */ static int cd9660_level1_convert_filename(const char *oldname, char *newname, int is_file) { /* * ISO 9660 : 10.1 * File Name shall not contain more than 8 d or d1 characters * File Name Extension shall not contain more than 3 d or d1 characters * Directory Identifier shall not contain more than 8 d or d1 characters */ int namelen = 0; int extlen = 0; int found_ext = 0; while (*oldname != '\0' && extlen < 3) { /* Handle period first, as it is special */ if (*oldname == '.') { if (found_ext) { *newname++ = '_'; extlen ++; } else { *newname++ = '.'; found_ext = 1; } } else { /* cut RISC OS file type off ISO name */ if (diskStructure.archimedes_enabled && *oldname == ',' && strlen(oldname) == 4) break; + /* Enforce 12.3 / 8 */ if (namelen == 8 && !found_ext) break; if (islower((unsigned char)*oldname)) *newname++ = toupper((unsigned char)*oldname); else if (isupper((unsigned char)*oldname) - || isdigit((unsigned char)*oldname)) + || isdigit((unsigned char)*oldname)) *newname++ = *oldname; else *newname++ = '_'; if (found_ext) extlen++; else namelen++; } - oldname ++; + oldname++; } if (is_file) { if (!found_ext && !diskStructure.omit_trailing_period) *newname++ = '.'; /* Add version */ sprintf(newname, ";%i", 1); } return namelen + extlen + found_ext; } /* XXX bounds checking! */ static int cd9660_level2_convert_filename(const char *oldname, char *newname, int is_file) { /* * ISO 9660 : 7.5.1 * File name : 0+ d or d1 characters * separator 1 (.) * File name extension : 0+ d or d1 characters * separator 2 (;) * File version number (5 characters, 1-32767) * 1 <= Sum of File name and File name extension <= 30 */ int namelen = 0; int extlen = 0; int found_ext = 0; while (*oldname != '\0' && namelen + extlen < 30) { /* Handle period first, as it is special */ if (*oldname == '.') { if (found_ext) { if (diskStructure.allow_multidot) { *newname++ = '.'; } else { *newname++ = '_'; } extlen ++; } else { *newname++ = '.'; found_ext = 1; } } else { /* cut RISC OS file type off ISO name */ if (diskStructure.archimedes_enabled && *oldname == ',' && strlen(oldname) == 4) break; if (islower((unsigned char)*oldname)) *newname++ = toupper((unsigned char)*oldname); else if (isupper((unsigned char)*oldname) || isdigit((unsigned char)*oldname)) *newname++ = *oldname; else if (diskStructure.allow_multidot && *oldname == '.') { *newname++ = '.'; } else { *newname++ = '_'; } if (found_ext) extlen++; else namelen++; } oldname ++; } if (is_file) { if (!found_ext && !diskStructure.omit_trailing_period) *newname++ = '.'; /* Add version */ sprintf(newname, ";%i", 1); } return namelen + extlen + found_ext; } #if 0 static int cd9660_joliet_convert_filename(const char *oldname, char *newname, int is_file) { /* TODO: implement later, move to cd9660_joliet.c ?? */ } #endif /* * Convert a file name to ISO compliant file name * @param char * oldname The original filename * @param char ** newname The new file name, in the appropriate character * set and of appropriate length * @param int 1 if file, 0 if directory * @returns int The length of the new string */ static int cd9660_convert_filename(const char *oldname, char *newname, int is_file) { assert(1 <= diskStructure.isoLevel && diskStructure.isoLevel <= 2); /* NEW */ cd9660_filename_conversion_functor conversion_function = NULL; if (diskStructure.isoLevel == 1) conversion_function = &cd9660_level1_convert_filename; else if (diskStructure.isoLevel == 2) conversion_function = &cd9660_level2_convert_filename; return (*conversion_function)(oldname, newname, is_file); } int cd9660_compute_record_size(cd9660node *node) { int size = node->isoDirRecord->length[0]; if (diskStructure.rock_ridge_enabled) size += node->susp_entry_size; size += node->su_tail_size; size += size & 1; /* Ensure length of record is even. */ assert(size <= 254); return size; } static void cd9660_populate_dot_records(cd9660node *node) { node->dot_record->fileDataSector = node->fileDataSector; memcpy(node->dot_record->isoDirRecord,node->isoDirRecord, 34); node->dot_record->isoDirRecord->name_len[0] = 1; node->dot_record->isoDirRecord->name[0] = 0; node->dot_record->isoDirRecord->name[1] = 0; node->dot_record->isoDirRecord->length[0] = 34; node->dot_record->fileRecordSize = cd9660_compute_record_size(node->dot_record); if (node == diskStructure.rootNode) { node->dot_dot_record->fileDataSector = node->fileDataSector; memcpy(node->dot_dot_record->isoDirRecord,node->isoDirRecord, 34); } else { node->dot_dot_record->fileDataSector = node->parent->fileDataSector; memcpy(node->dot_dot_record->isoDirRecord, node->parent->isoDirRecord,34); } node->dot_dot_record->isoDirRecord->name_len[0] = 1; node->dot_dot_record->isoDirRecord->name[0] = 1; node->dot_dot_record->isoDirRecord->name[1] = 0; node->dot_dot_record->isoDirRecord->length[0] = 34; node->dot_dot_record->fileRecordSize = cd9660_compute_record_size(node->dot_dot_record); } /* * @param struct cd9660node *node The node * @param int The offset (in bytes) - SHOULD align to the beginning of a sector * @returns int The total size of files and directory entries (should be * a multiple of sector size) */ static int64_t cd9660_compute_offsets(cd9660node *node, int64_t startOffset) { /* * This function needs to compute the size of directory records and * runs, file lengths, and set the appropriate variables both in * cd9660node and isoDirEntry */ int64_t used_bytes = 0; int64_t current_sector_usage = 0; cd9660node *child; fsinode *inode; int64_t r; assert(node != NULL); /* * NOTE : There needs to be some special case detection for * the "real root" node, since for it, node->node is undefined */ node->fileDataSector = -1; if (node->type & CD9660_TYPE_DIR) { node->fileRecordSize = cd9660_compute_record_size(node); /*Set what sector this directory starts in*/ node->fileDataSector = CD9660_BLOCKS(diskStructure.sectorSize,startOffset); cd9660_bothendian_dword(node->fileDataSector, node->isoDirRecord->extent); /* * First loop over children, need to know the size of * their directory records */ node->fileSectorsUsed = 1; TAILQ_FOREACH(child, &node->cn_children, cn_next_child) { node->fileDataLength += cd9660_compute_record_size(child); if ((cd9660_compute_record_size(child) + current_sector_usage) >= diskStructure.sectorSize) { current_sector_usage = 0; node->fileSectorsUsed++; } current_sector_usage += cd9660_compute_record_size(child); } cd9660_bothendian_dword(node->fileSectorsUsed * diskStructure.sectorSize,node->isoDirRecord->size); /* * This should point to the sector after the directory * record (or, the first byte in that sector) */ used_bytes += node->fileSectorsUsed * diskStructure.sectorSize; for (child = TAILQ_NEXT(node->dot_dot_record, cn_next_child); child != NULL; child = TAILQ_NEXT(child, cn_next_child)) { /* Directories need recursive call */ if (S_ISDIR(child->node->type)) { r = cd9660_compute_offsets(child, used_bytes + startOffset); if (r != -1) used_bytes += r; else return -1; } } /* Explicitly set the . and .. records */ cd9660_populate_dot_records(node); /* Finally, do another iteration to write the file data*/ for (child = TAILQ_NEXT(node->dot_dot_record, cn_next_child); child != NULL; child = TAILQ_NEXT(child, cn_next_child)) { /* Files need extent set */ if (S_ISDIR(child->node->type)) continue; child->fileRecordSize = cd9660_compute_record_size(child); child->fileSectorsUsed = CD9660_BLOCKS(diskStructure.sectorSize, child->fileDataLength); inode = child->node->inode; if ((inode->flags & FI_ALLOCATED) == 0) { inode->ino = CD9660_BLOCKS(diskStructure.sectorSize, used_bytes + startOffset); inode->flags |= FI_ALLOCATED; used_bytes += child->fileSectorsUsed * diskStructure.sectorSize; } else { INODE_WARNX(("%s: already allocated inode %d " "data sectors at %" PRIu32, __func__, (int)inode->st.st_ino, inode->ino)); } child->fileDataSector = inode->ino; cd9660_bothendian_dword(child->fileDataSector, child->isoDirRecord->extent); } } return used_bytes; } #if 0 /* Might get rid of this func */ static int cd9660_copy_stat_info(cd9660node *from, cd9660node *to, int file) { to->node->inode->st.st_dev = 0; to->node->inode->st.st_ino = 0; to->node->inode->st.st_size = 0; to->node->inode->st.st_blksize = from->node->inode->st.st_blksize; to->node->inode->st.st_atime = from->node->inode->st.st_atime; to->node->inode->st.st_mtime = from->node->inode->st.st_mtime; to->node->inode->st.st_ctime = from->node->inode->st.st_ctime; to->node->inode->st.st_uid = from->node->inode->st.st_uid; to->node->inode->st.st_gid = from->node->inode->st.st_gid; to->node->inode->st.st_mode = from->node->inode->st.st_mode; /* Clear out type */ to->node->inode->st.st_mode = to->node->inode->st.st_mode & ~(S_IFMT); if (file) to->node->inode->st.st_mode |= S_IFREG; else to->node->inode->st.st_mode |= S_IFDIR; return 1; } #endif static cd9660node * cd9660_create_virtual_entry(const char *name, cd9660node *parent, int file, int insert) { cd9660node *temp; fsnode * tfsnode; assert(parent != NULL); temp = cd9660_allocate_cd9660node(); if (temp == NULL) return NULL; if ((tfsnode = malloc(sizeof(fsnode))) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_create_virtual_entry"); return NULL; } /* Assume for now name is a valid length */ if ((tfsnode->name = malloc(strlen(name) + 1)) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_create_virtual_entry"); return NULL; } if ((temp->isoDirRecord = malloc(sizeof(iso_directory_record_cd9660))) == NULL) { CD9660_MEM_ALLOC_ERROR("cd9660_create_virtual_entry"); return NULL; } strcpy(tfsnode->name, name); cd9660_convert_filename(tfsnode->name, temp->isoDirRecord->name, file); temp->node = tfsnode; temp->parent = parent; if (insert) { if (temp->parent != NULL) { temp->level = temp->parent->level + 1; if (!TAILQ_EMPTY(&temp->parent->cn_children)) cd9660_sorted_child_insert(temp->parent, temp); else TAILQ_INSERT_HEAD(&temp->parent->cn_children, temp, cn_next_child); } } if (parent->node != NULL) { tfsnode->type = parent->node->type; } /* Clear out file type bits */ tfsnode->type &= ~(S_IFMT); if (file) tfsnode->type |= S_IFREG; else tfsnode->type |= S_IFDIR; /* Indicate that there is no spec entry (inode) */ tfsnode->flags &= ~(FSNODE_F_HASSPEC); #if 0 cd9660_copy_stat_info(parent, temp, file); #endif return temp; } static cd9660node * cd9660_create_file(const char * name, cd9660node *parent, cd9660node *me) { cd9660node *temp; temp = cd9660_create_virtual_entry(name,parent,1,1); if (temp == NULL) return NULL; temp->fileDataLength = 0; temp->type = CD9660_TYPE_FILE | CD9660_TYPE_VIRTUAL; if ((temp->node->inode = calloc(1, sizeof(fsinode))) == NULL) return NULL; *temp->node->inode = *me->node->inode; if (cd9660_translate_node_common(temp) == 0) return NULL; return temp; } /* * Create a new directory which does not exist on disk * @param const char * name The name to assign to the directory * @param const char * parent Pointer to the parent directory * @returns cd9660node * Pointer to the new directory */ static cd9660node * cd9660_create_directory(const char *name, cd9660node *parent, cd9660node *me) { cd9660node *temp; temp = cd9660_create_virtual_entry(name,parent,0,1); if (temp == NULL) return NULL; temp->node->type |= S_IFDIR; temp->type = CD9660_TYPE_DIR | CD9660_TYPE_VIRTUAL; if ((temp->node->inode = calloc(1, sizeof(fsinode))) == NULL) return NULL; *temp->node->inode = *me->node->inode; if (cd9660_translate_node_common(temp) == 0) return NULL; return temp; } static cd9660node * cd9660_create_special_directory(u_char type, cd9660node *parent) { cd9660node *temp, *first; char na[2]; assert(parent != NULL); if (type == CD9660_TYPE_DOT) na[0] = 0; else if (type == CD9660_TYPE_DOTDOT) na[0] = 1; else return 0; na[1] = 0; if ((temp = cd9660_create_virtual_entry(na, parent, 0, 0)) == NULL) return NULL; temp->parent = parent; temp->type = type; temp->isoDirRecord->length[0] = 34; /* Dot record is always first */ if (type == CD9660_TYPE_DOT) { parent->dot_record = temp; TAILQ_INSERT_HEAD(&parent->cn_children, temp, cn_next_child); /* DotDot should be second */ } else if (type == CD9660_TYPE_DOTDOT) { parent->dot_dot_record = temp; /* * If the first child is the dot record, insert * this second. Otherwise, insert it at the head. */ if ((first = TAILQ_FIRST(&parent->cn_children)) == NULL || (first->type & CD9660_TYPE_DOT) == 0) { TAILQ_INSERT_HEAD(&parent->cn_children, temp, cn_next_child); } else { TAILQ_INSERT_AFTER(&parent->cn_children, first, temp, cn_next_child); } } return temp; } int cd9660_add_generic_bootimage(const char *bootimage) { struct stat stbuf; assert(bootimage != NULL); if (*bootimage == '\0') { warnx("Error: Boot image must be a filename"); return 0; } if ((diskStructure.generic_bootimage = strdup(bootimage)) == NULL) { warn("%s: strdup", __func__); return 0; } /* Get information about the file */ if (lstat(diskStructure.generic_bootimage, &stbuf) == -1) err(EXIT_FAILURE, "%s: lstat(\"%s\")", __func__, diskStructure.generic_bootimage); if (stbuf.st_size > 32768) { warnx("Error: Boot image must be no greater than 32768 bytes"); return 0; } if (diskStructure.verbose_level > 0) { printf("Generic boot image image has size %lld\n", (long long)stbuf.st_size); } diskStructure.has_generic_bootimage = 1; return 1; } Index: user/alc/PQ_LAUNDRY/usr.sbin/pmcstat/pmcstat.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.sbin/pmcstat/pmcstat.c (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.sbin/pmcstat/pmcstat.c (revision 306832) @@ -1,1537 +1,1547 @@ /*- * Copyright (c) 2003-2008, Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pmcstat.h" /* * A given invocation of pmcstat(8) can manage multiple PMCs of both * the system-wide and per-process variety. Each of these could be in * 'counting mode' or in 'sampling mode'. * * For 'counting mode' PMCs, pmcstat(8) will periodically issue a * pmc_read() at the configured time interval and print out the value * of the requested PMCs. * * For 'sampling mode' PMCs it can log to a file for offline analysis, * or can analyse sampling data "on the fly", either by converting * samples to printed textual form or by creating gprof(1) compatible * profiles, one per program executed. When creating gprof(1) * profiles it can optionally merge entries from multiple processes * for a given executable into a single profile file. * * pmcstat(8) can also execute a command line and attach PMCs to the * resulting child process. The protocol used is as follows: * * - parent creates a socketpair for two way communication and * fork()s. * - subsequently: * * /Parent/ /Child/ * * - Wait for childs token. * - Sends token. * - Awaits signal to start. * - Attaches PMCs to the child's pid * and starts them. Sets up * monitoring for the child. * - Signals child to start. * - Receives signal, attempts exec(). * * After this point normal processing can happen. */ /* Globals */ int pmcstat_displayheight = DEFAULT_DISPLAY_HEIGHT; int pmcstat_displaywidth = DEFAULT_DISPLAY_WIDTH; static int pmcstat_sockpair[NSOCKPAIRFD]; static int pmcstat_kq; static kvm_t *pmcstat_kvm; static struct kinfo_proc *pmcstat_plist; struct pmcstat_args args; static void pmcstat_clone_event_descriptor(struct pmcstat_ev *ev, const cpuset_t *cpumask) { int cpu; struct pmcstat_ev *ev_clone; for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { if (!CPU_ISSET(cpu, cpumask)) continue; if ((ev_clone = malloc(sizeof(*ev_clone))) == NULL) errx(EX_SOFTWARE, "ERROR: Out of memory"); (void) memset(ev_clone, 0, sizeof(*ev_clone)); ev_clone->ev_count = ev->ev_count; ev_clone->ev_cpu = cpu; ev_clone->ev_cumulative = ev->ev_cumulative; ev_clone->ev_flags = ev->ev_flags; ev_clone->ev_mode = ev->ev_mode; ev_clone->ev_name = strdup(ev->ev_name); + if (ev_clone->ev_name == NULL) + errx(EX_SOFTWARE, "ERROR: Out of memory"); ev_clone->ev_pmcid = ev->ev_pmcid; ev_clone->ev_saved = ev->ev_saved; ev_clone->ev_spec = strdup(ev->ev_spec); + if (ev_clone->ev_spec == NULL) + errx(EX_SOFTWARE, "ERROR: Out of memory"); STAILQ_INSERT_TAIL(&args.pa_events, ev_clone, ev_next); } } static void pmcstat_get_cpumask(const char *cpuspec, cpuset_t *cpumask) { int cpu; const char *s; char *end; CPU_ZERO(cpumask); s = cpuspec; do { cpu = strtol(s, &end, 0); if (cpu < 0 || end == s) errx(EX_USAGE, "ERROR: Illegal CPU specification \"%s\".", cpuspec); CPU_SET(cpu, cpumask); s = end + strspn(end, ", \t"); } while (*s); assert(!CPU_EMPTY(cpumask)); } void pmcstat_attach_pmcs(void) { struct pmcstat_ev *ev; struct pmcstat_target *pt; int count; /* Attach all process PMCs to target processes. */ count = 0; STAILQ_FOREACH(ev, &args.pa_events, ev_next) { if (PMC_IS_SYSTEM_MODE(ev->ev_mode)) continue; SLIST_FOREACH(pt, &args.pa_targets, pt_next) if (pmc_attach(ev->ev_pmcid, pt->pt_pid) == 0) count++; else if (errno != ESRCH) err(EX_OSERR, "ERROR: cannot attach pmc \"%s\" to process %d", ev->ev_name, (int)pt->pt_pid); } if (count == 0) errx(EX_DATAERR, "ERROR: No processes were attached to."); } void pmcstat_cleanup(void) { - struct pmcstat_ev *ev, *tmp; + struct pmcstat_ev *ev; /* release allocated PMCs. */ - STAILQ_FOREACH_SAFE(ev, &args.pa_events, ev_next, tmp) - if (ev->ev_pmcid != PMC_ID_INVALID) { - if (pmc_stop(ev->ev_pmcid) < 0) - err(EX_OSERR, "ERROR: cannot stop pmc 0x%x \"%s\"", - ev->ev_pmcid, ev->ev_name); - if (pmc_release(ev->ev_pmcid) < 0) - err(EX_OSERR, "ERROR: cannot release pmc 0x%x \"%s\"", - ev->ev_pmcid, ev->ev_name); - free(ev->ev_name); - free(ev->ev_spec); - STAILQ_REMOVE(&args.pa_events, ev, pmcstat_ev, ev_next); - free(ev); - } + STAILQ_FOREACH(ev, &args.pa_events, ev_next) + if (ev->ev_pmcid != PMC_ID_INVALID) { + if (pmc_stop(ev->ev_pmcid) < 0) + err(EX_OSERR, + "ERROR: cannot stop pmc 0x%x \"%s\"", + ev->ev_pmcid, ev->ev_name); + if (pmc_release(ev->ev_pmcid) < 0) + err(EX_OSERR, + "ERROR: cannot release pmc 0x%x \"%s\"", + ev->ev_pmcid, ev->ev_name); + } /* de-configure the log file if present. */ if (args.pa_flags & (FLAG_HAS_PIPE | FLAG_HAS_OUTPUT_LOGFILE)) (void) pmc_configure_logfile(-1); if (args.pa_logparser) { pmclog_close(args.pa_logparser); args.pa_logparser = NULL; } pmcstat_shutdown_logging(); } void pmcstat_create_process(void) { char token; pid_t pid; struct kevent kev; struct pmcstat_target *pt; if (socketpair(AF_UNIX, SOCK_STREAM, 0, pmcstat_sockpair) < 0) err(EX_OSERR, "ERROR: cannot create socket pair"); switch (pid = fork()) { case -1: err(EX_OSERR, "ERROR: cannot fork"); /*NOTREACHED*/ case 0: /* child */ (void) close(pmcstat_sockpair[PARENTSOCKET]); /* Write a token to tell our parent we've started executing. */ if (write(pmcstat_sockpair[CHILDSOCKET], "+", 1) != 1) err(EX_OSERR, "ERROR (child): cannot write token"); /* Wait for our parent to signal us to start. */ if (read(pmcstat_sockpair[CHILDSOCKET], &token, 1) < 0) err(EX_OSERR, "ERROR (child): cannot read token"); (void) close(pmcstat_sockpair[CHILDSOCKET]); /* exec() the program requested */ execvp(*args.pa_argv, args.pa_argv); /* and if that fails, notify the parent */ kill(getppid(), SIGCHLD); err(EX_OSERR, "ERROR: execvp \"%s\" failed", *args.pa_argv); /*NOTREACHED*/ default: /* parent */ (void) close(pmcstat_sockpair[CHILDSOCKET]); break; } /* Ask to be notified via a kevent when the target process exits. */ EV_SET(&kev, pid, EVFILT_PROC, EV_ADD|EV_ONESHOT, NOTE_EXIT, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: cannot monitor child process %d", pid); if ((pt = malloc(sizeof(*pt))) == NULL) errx(EX_SOFTWARE, "ERROR: Out of memory."); pt->pt_pid = pid; SLIST_INSERT_HEAD(&args.pa_targets, pt, pt_next); /* Wait for the child to signal that its ready to go. */ if (read(pmcstat_sockpair[PARENTSOCKET], &token, 1) < 0) err(EX_OSERR, "ERROR (parent): cannot read token"); return; } void pmcstat_find_targets(const char *spec) { int n, nproc, pid, rv; struct pmcstat_target *pt; char errbuf[_POSIX2_LINE_MAX], *end; static struct kinfo_proc *kp; regex_t reg; regmatch_t regmatch; /* First check if we've been given a process id. */ pid = strtol(spec, &end, 0); if (end != spec && pid >= 0) { if ((pt = malloc(sizeof(*pt))) == NULL) goto outofmemory; pt->pt_pid = pid; SLIST_INSERT_HEAD(&args.pa_targets, pt, pt_next); return; } /* Otherwise treat arg as a regular expression naming processes. */ if (pmcstat_kvm == NULL) { if ((pmcstat_kvm = kvm_openfiles(NULL, "/dev/null", NULL, 0, errbuf)) == NULL) err(EX_OSERR, "ERROR: Cannot open kernel \"%s\"", errbuf); if ((pmcstat_plist = kvm_getprocs(pmcstat_kvm, KERN_PROC_PROC, 0, &nproc)) == NULL) err(EX_OSERR, "ERROR: Cannot get process list: %s", kvm_geterr(pmcstat_kvm)); } else nproc = 0; if ((rv = regcomp(®, spec, REG_EXTENDED|REG_NOSUB)) != 0) { regerror(rv, ®, errbuf, sizeof(errbuf)); err(EX_DATAERR, "ERROR: Failed to compile regex \"%s\": %s", spec, errbuf); } for (n = 0, kp = pmcstat_plist; n < nproc; n++, kp++) { if ((rv = regexec(®, kp->ki_comm, 1, ®match, 0)) == 0) { if ((pt = malloc(sizeof(*pt))) == NULL) goto outofmemory; pt->pt_pid = kp->ki_pid; SLIST_INSERT_HEAD(&args.pa_targets, pt, pt_next); } else if (rv != REG_NOMATCH) { regerror(rv, ®, errbuf, sizeof(errbuf)); errx(EX_SOFTWARE, "ERROR: Regex evalation failed: %s", errbuf); } } regfree(®); return; outofmemory: errx(EX_SOFTWARE, "Out of memory."); /*NOTREACHED*/ } void pmcstat_kill_process(void) { struct pmcstat_target *pt; assert(args.pa_flags & FLAG_HAS_COMMANDLINE); /* * If a command line was specified, it would be the very first * in the list, before any other processes specified by -t. */ pt = SLIST_FIRST(&args.pa_targets); assert(pt != NULL); if (kill(pt->pt_pid, SIGINT) != 0) err(EX_OSERR, "ERROR: cannot signal child process"); } void pmcstat_start_pmcs(void) { struct pmcstat_ev *ev; STAILQ_FOREACH(ev, &args.pa_events, ev_next) { assert(ev->ev_pmcid != PMC_ID_INVALID); if (pmc_start(ev->ev_pmcid) < 0) { warn("ERROR: Cannot start pmc 0x%x \"%s\"", ev->ev_pmcid, ev->ev_name); pmcstat_cleanup(); exit(EX_OSERR); } } } void pmcstat_print_headers(void) { struct pmcstat_ev *ev; int c, w; (void) fprintf(args.pa_printfile, PRINT_HEADER_PREFIX); STAILQ_FOREACH(ev, &args.pa_events, ev_next) { if (PMC_IS_SAMPLING_MODE(ev->ev_mode)) continue; c = PMC_IS_SYSTEM_MODE(ev->ev_mode) ? 's' : 'p'; if (ev->ev_fieldskip != 0) (void) fprintf(args.pa_printfile, "%*s", ev->ev_fieldskip, ""); w = ev->ev_fieldwidth - ev->ev_fieldskip - 2; if (c == 's') (void) fprintf(args.pa_printfile, "s/%02d/%-*s ", ev->ev_cpu, w-3, ev->ev_name); else (void) fprintf(args.pa_printfile, "p/%*s ", w, ev->ev_name); } (void) fflush(args.pa_printfile); } void pmcstat_print_counters(void) { int extra_width; struct pmcstat_ev *ev; pmc_value_t value; extra_width = sizeof(PRINT_HEADER_PREFIX) - 1; STAILQ_FOREACH(ev, &args.pa_events, ev_next) { /* skip sampling mode counters */ if (PMC_IS_SAMPLING_MODE(ev->ev_mode)) continue; if (pmc_read(ev->ev_pmcid, &value) < 0) err(EX_OSERR, "ERROR: Cannot read pmc \"%s\"", ev->ev_name); (void) fprintf(args.pa_printfile, "%*ju ", ev->ev_fieldwidth + extra_width, (uintmax_t) ev->ev_cumulative ? value : (value - ev->ev_saved)); if (ev->ev_cumulative == 0) ev->ev_saved = value; extra_width = 0; } (void) fflush(args.pa_printfile); } /* * Print output */ void pmcstat_print_pmcs(void) { static int linecount = 0; /* check if we need to print a header line */ if (++linecount > pmcstat_displayheight) { (void) fprintf(args.pa_printfile, "\n"); linecount = 1; } if (linecount == 1) pmcstat_print_headers(); (void) fprintf(args.pa_printfile, "\n"); pmcstat_print_counters(); return; } /* * Do process profiling * * If a pid was specified, attach each allocated PMC to the target * process. Otherwise, fork a child and attach the PMCs to the child, * and have the child exec() the target program. */ void pmcstat_start_process(void) { /* Signal the child to proceed. */ if (write(pmcstat_sockpair[PARENTSOCKET], "!", 1) != 1) err(EX_OSERR, "ERROR (parent): write of token failed"); (void) close(pmcstat_sockpair[PARENTSOCKET]); } void pmcstat_show_usage(void) { errx(EX_USAGE, "[options] [commandline]\n" "\t Measure process and/or system performance using hardware\n" "\t performance monitoring counters.\n" "\t Options include:\n" "\t -C\t\t (toggle) show cumulative counts\n" "\t -D path\t create profiles in directory \"path\"\n" "\t -E\t\t (toggle) show counts at process exit\n" "\t -F file\t write a system-wide callgraph (Kcachegrind format)" " to \"file\"\n" "\t -G file\t write a system-wide callgraph to \"file\"\n" "\t -M file\t print executable/gmon file map to \"file\"\n" "\t -N\t\t (toggle) capture callchains\n" "\t -O file\t send log output to \"file\"\n" "\t -P spec\t allocate a process-private sampling PMC\n" "\t -R file\t read events from \"file\"\n" "\t -S spec\t allocate a system-wide sampling PMC\n" "\t -T\t\t start in top mode\n" "\t -W\t\t (toggle) show counts per context switch\n" "\t -a file\t print sampled PCs and callgraph to \"file\"\n" "\t -c cpu-list\t set cpus for subsequent system-wide PMCs\n" "\t -d\t\t (toggle) track descendants\n" "\t -e\t\t use wide history counter for gprof(1) output\n" "\t -f spec\t pass \"spec\" to as plugin option\n" "\t -g\t\t produce gprof(1) compatible profiles\n" "\t -k dir\t\t set the path to the kernel\n" "\t -l secs\t set duration time\n" "\t -m file\t print sampled PCs to \"file\"\n" "\t -n rate\t set sampling rate\n" "\t -o file\t send print output to \"file\"\n" "\t -p spec\t allocate a process-private counting PMC\n" "\t -q\t\t suppress verbosity\n" "\t -r fsroot\t specify FS root directory\n" "\t -s spec\t allocate a system-wide counting PMC\n" "\t -t process-spec attach to running processes matching " "\"process-spec\"\n" "\t -v\t\t increase verbosity\n" "\t -w secs\t set printing time interval\n" "\t -z depth\t limit callchain display depth" ); } /* * At exit handler for top mode */ void pmcstat_topexit(void) { if (!args.pa_toptty) return; /* * Shutdown ncurses. */ clrtoeol(); refresh(); endwin(); } /* * Main */ int main(int argc, char **argv) { cpuset_t cpumask, rootmask; double interval; double duration; int option, npmc; int c, check_driver_stats, current_sampling_count; int do_callchain, do_descendants, do_logproccsw, do_logprocexit; int do_print, do_read; size_t len; int graphdepth; int pipefd[2], rfd; int use_cumulative_counts; short cf, cb; char *end, *tmp; const char *errmsg, *graphfilename; enum pmcstat_state runstate; struct pmc_driverstats ds_start, ds_end; struct pmcstat_ev *ev; struct sigaction sa; struct kevent kev; struct winsize ws; struct stat sb; char buffer[PATH_MAX]; check_driver_stats = 0; current_sampling_count = DEFAULT_SAMPLE_COUNT; do_callchain = 1; do_descendants = 0; do_logproccsw = 0; do_logprocexit = 0; use_cumulative_counts = 0; graphfilename = "-"; args.pa_required = 0; args.pa_flags = 0; args.pa_verbosity = 1; args.pa_logfd = -1; args.pa_fsroot = ""; args.pa_samplesdir = "."; args.pa_printfile = stderr; args.pa_graphdepth = DEFAULT_CALLGRAPH_DEPTH; args.pa_graphfile = NULL; args.pa_interval = DEFAULT_WAIT_INTERVAL; args.pa_mapfilename = NULL; args.pa_inputpath = NULL; args.pa_outputpath = NULL; args.pa_pplugin = PMCSTAT_PL_NONE; args.pa_plugin = PMCSTAT_PL_NONE; args.pa_ctdumpinstr = 1; args.pa_topmode = PMCSTAT_TOP_DELTA; args.pa_toptty = 0; args.pa_topcolor = 0; args.pa_mergepmc = 0; args.pa_duration = 0.0; STAILQ_INIT(&args.pa_events); SLIST_INIT(&args.pa_targets); bzero(&ds_start, sizeof(ds_start)); bzero(&ds_end, sizeof(ds_end)); ev = NULL; CPU_ZERO(&cpumask); /* Default to using the running system kernel. */ len = 0; if (sysctlbyname("kern.bootfile", NULL, &len, NULL, 0) == -1) err(EX_OSERR, "ERROR: Cannot determine path of running kernel"); - args.pa_kernel = malloc(len + 1); + args.pa_kernel = malloc(len); + if (args.pa_kernel == NULL) + errx(EX_SOFTWARE, "ERROR: Out of memory."); if (sysctlbyname("kern.bootfile", args.pa_kernel, &len, NULL, 0) == -1) err(EX_OSERR, "ERROR: Cannot determine path of running kernel"); /* * The initial CPU mask specifies the root mask of this process * which is usually all CPUs in the system. */ if (cpuset_getaffinity(CPU_LEVEL_ROOT, CPU_WHICH_PID, -1, sizeof(rootmask), &rootmask) == -1) err(EX_OSERR, "ERROR: Cannot determine the root set of CPUs"); CPU_COPY(&rootmask, &cpumask); while ((option = getopt(argc, argv, "CD:EF:G:M:NO:P:R:S:TWa:c:def:gk:l:m:n:o:p:qr:s:t:vw:z:")) != -1) switch (option) { case 'a': /* Annotate + callgraph */ args.pa_flags |= FLAG_DO_ANNOTATE; args.pa_plugin = PMCSTAT_PL_ANNOTATE_CG; graphfilename = optarg; break; case 'C': /* cumulative values */ use_cumulative_counts = !use_cumulative_counts; args.pa_required |= FLAG_HAS_COUNTING_PMCS; break; case 'c': /* CPU */ if (optarg[0] == '*' && optarg[1] == '\0') CPU_COPY(&rootmask, &cpumask); else pmcstat_get_cpumask(optarg, &cpumask); args.pa_flags |= FLAGS_HAS_CPUMASK; args.pa_required |= FLAG_HAS_SYSTEM_PMCS; break; case 'D': if (stat(optarg, &sb) < 0) err(EX_OSERR, "ERROR: Cannot stat \"%s\"", optarg); if (!S_ISDIR(sb.st_mode)) errx(EX_USAGE, "ERROR: \"%s\" is not a directory.", optarg); args.pa_samplesdir = optarg; args.pa_flags |= FLAG_HAS_SAMPLESDIR; args.pa_required |= FLAG_DO_GPROF; break; case 'd': /* toggle descendents */ do_descendants = !do_descendants; args.pa_required |= FLAG_HAS_PROCESS_PMCS; break; case 'e': /* wide gprof metrics */ args.pa_flags |= FLAG_DO_WIDE_GPROF_HC; break; case 'F': /* produce a system-wide calltree */ args.pa_flags |= FLAG_DO_CALLGRAPHS; args.pa_plugin = PMCSTAT_PL_CALLTREE; graphfilename = optarg; break; case 'f': /* plugins options */ if (args.pa_plugin == PMCSTAT_PL_NONE) err(EX_USAGE, "ERROR: Need -g/-G/-m/-T."); pmcstat_pluginconfigure_log(optarg); break; case 'G': /* produce a system-wide callgraph */ args.pa_flags |= FLAG_DO_CALLGRAPHS; args.pa_plugin = PMCSTAT_PL_CALLGRAPH; graphfilename = optarg; break; case 'g': /* produce gprof compatible profiles */ args.pa_flags |= FLAG_DO_GPROF; args.pa_pplugin = PMCSTAT_PL_CALLGRAPH; args.pa_plugin = PMCSTAT_PL_GPROF; break; case 'k': /* pathname to the kernel */ free(args.pa_kernel); args.pa_kernel = strdup(optarg); + if (args.pa_kernel == NULL) + errx(EX_SOFTWARE, "ERROR: Out of memory"); args.pa_required |= FLAG_DO_ANALYSIS; args.pa_flags |= FLAG_HAS_KERNELPATH; break; case 'l': /* time duration in seconds */ duration = strtod(optarg, &end); if (*end != '\0' || duration <= 0) errx(EX_USAGE, "ERROR: Illegal duration time " "value \"%s\".", optarg); args.pa_flags |= FLAG_HAS_DURATION; args.pa_duration = duration; break; case 'm': args.pa_flags |= FLAG_DO_ANNOTATE; args.pa_plugin = PMCSTAT_PL_ANNOTATE; graphfilename = optarg; break; case 'E': /* log process exit */ do_logprocexit = !do_logprocexit; args.pa_required |= (FLAG_HAS_PROCESS_PMCS | FLAG_HAS_COUNTING_PMCS | FLAG_HAS_OUTPUT_LOGFILE); break; case 'M': /* mapfile */ args.pa_mapfilename = optarg; break; case 'N': do_callchain = !do_callchain; args.pa_required |= FLAG_HAS_SAMPLING_PMCS; break; case 'p': /* process virtual counting PMC */ case 's': /* system-wide counting PMC */ case 'P': /* process virtual sampling PMC */ case 'S': /* system-wide sampling PMC */ if ((ev = malloc(sizeof(*ev))) == NULL) errx(EX_SOFTWARE, "ERROR: Out of memory."); switch (option) { case 'p': ev->ev_mode = PMC_MODE_TC; break; case 's': ev->ev_mode = PMC_MODE_SC; break; case 'P': ev->ev_mode = PMC_MODE_TS; break; case 'S': ev->ev_mode = PMC_MODE_SS; break; } if (option == 'P' || option == 'p') { args.pa_flags |= FLAG_HAS_PROCESS_PMCS; args.pa_required |= (FLAG_HAS_COMMANDLINE | FLAG_HAS_TARGET); } if (option == 'P' || option == 'S') { args.pa_flags |= FLAG_HAS_SAMPLING_PMCS; args.pa_required |= (FLAG_HAS_PIPE | FLAG_HAS_OUTPUT_LOGFILE); } if (option == 'p' || option == 's') args.pa_flags |= FLAG_HAS_COUNTING_PMCS; if (option == 's' || option == 'S') args.pa_flags |= FLAG_HAS_SYSTEM_PMCS; ev->ev_spec = strdup(optarg); + if (ev->ev_spec == NULL) + errx(EX_SOFTWARE, "ERROR: Out of memory."); if (option == 'S' || option == 'P') ev->ev_count = current_sampling_count; else ev->ev_count = -1; if (option == 'S' || option == 's') ev->ev_cpu = CPU_FFS(&cpumask) - 1; else ev->ev_cpu = PMC_CPU_ANY; ev->ev_flags = 0; if (do_callchain) ev->ev_flags |= PMC_F_CALLCHAIN; if (do_descendants) ev->ev_flags |= PMC_F_DESCENDANTS; if (do_logprocexit) ev->ev_flags |= PMC_F_LOG_PROCEXIT; if (do_logproccsw) ev->ev_flags |= PMC_F_LOG_PROCCSW; ev->ev_cumulative = use_cumulative_counts; ev->ev_saved = 0LL; ev->ev_pmcid = PMC_ID_INVALID; /* extract event name */ c = strcspn(optarg, ", \t"); ev->ev_name = malloc(c + 1); + if (ev->ev_name == NULL) + errx(EX_SOFTWARE, "ERROR: Out of memory."); (void) strncpy(ev->ev_name, optarg, c); *(ev->ev_name + c) = '\0'; STAILQ_INSERT_TAIL(&args.pa_events, ev, ev_next); if (option == 's' || option == 'S') { CPU_CLR(ev->ev_cpu, &cpumask); pmcstat_clone_event_descriptor(ev, &cpumask); CPU_SET(ev->ev_cpu, &cpumask); } break; case 'n': /* sampling count */ current_sampling_count = strtol(optarg, &end, 0); if (*end != '\0' || current_sampling_count <= 0) errx(EX_USAGE, "ERROR: Illegal count value \"%s\".", optarg); args.pa_required |= FLAG_HAS_SAMPLING_PMCS; break; case 'o': /* outputfile */ if (args.pa_printfile != NULL && args.pa_printfile != stdout && args.pa_printfile != stderr) (void) fclose(args.pa_printfile); if ((args.pa_printfile = fopen(optarg, "w")) == NULL) errx(EX_OSERR, "ERROR: cannot open \"%s\" for writing.", optarg); args.pa_flags |= FLAG_DO_PRINT; break; case 'O': /* sampling output */ if (args.pa_outputpath) errx(EX_USAGE, "ERROR: option -O may only be specified once."); args.pa_outputpath = optarg; args.pa_flags |= FLAG_HAS_OUTPUT_LOGFILE; break; case 'q': /* quiet mode */ args.pa_verbosity = 0; break; case 'r': /* root FS path */ args.pa_fsroot = optarg; break; case 'R': /* read an existing log file */ if (args.pa_inputpath != NULL) errx(EX_USAGE, "ERROR: option -R may only be specified once."); args.pa_inputpath = optarg; if (args.pa_printfile == stderr) args.pa_printfile = stdout; args.pa_flags |= FLAG_READ_LOGFILE; break; case 't': /* target pid or process name */ pmcstat_find_targets(optarg); args.pa_flags |= FLAG_HAS_TARGET; args.pa_required |= FLAG_HAS_PROCESS_PMCS; break; case 'T': /* top mode */ args.pa_flags |= FLAG_DO_TOP; args.pa_plugin = PMCSTAT_PL_CALLGRAPH; args.pa_ctdumpinstr = 0; args.pa_mergepmc = 1; if (args.pa_printfile == stderr) args.pa_printfile = stdout; break; case 'v': /* verbose */ args.pa_verbosity++; break; case 'w': /* wait interval */ interval = strtod(optarg, &end); if (*end != '\0' || interval <= 0) errx(EX_USAGE, "ERROR: Illegal wait interval value \"%s\".", optarg); args.pa_flags |= FLAG_HAS_WAIT_INTERVAL; args.pa_interval = interval; break; case 'W': /* toggle LOG_CSW */ do_logproccsw = !do_logproccsw; args.pa_required |= (FLAG_HAS_PROCESS_PMCS | FLAG_HAS_COUNTING_PMCS | FLAG_HAS_OUTPUT_LOGFILE); break; case 'z': graphdepth = strtod(optarg, &end); if (*end != '\0' || graphdepth <= 0) errx(EX_USAGE, "ERROR: Illegal callchain depth \"%s\".", optarg); args.pa_graphdepth = graphdepth; args.pa_required |= FLAG_DO_CALLGRAPHS; break; case '?': default: pmcstat_show_usage(); break; } args.pa_argc = (argc -= optind); args.pa_argv = (argv += optind); /* If we read from logfile and no specified CPU mask use * the maximum CPU count. */ if ((args.pa_flags & FLAG_READ_LOGFILE) && (args.pa_flags & FLAGS_HAS_CPUMASK) == 0) CPU_FILL(&cpumask); args.pa_cpumask = cpumask; /* For selecting CPUs using -R. */ if (argc) /* command line present */ args.pa_flags |= FLAG_HAS_COMMANDLINE; if (args.pa_flags & (FLAG_DO_GPROF | FLAG_DO_CALLGRAPHS | FLAG_DO_ANNOTATE | FLAG_DO_TOP)) args.pa_flags |= FLAG_DO_ANALYSIS; /* * Check invocation syntax. */ /* disallow -O and -R together */ if (args.pa_outputpath && args.pa_inputpath) errx(EX_USAGE, "ERROR: options -O and -R are mutually exclusive."); /* disallow -T and -l together */ if ((args.pa_flags & FLAG_HAS_DURATION) && (args.pa_flags & FLAG_DO_TOP)) errx(EX_USAGE, "ERROR: options -T and -l are mutually " "exclusive."); /* -a and -m require -R */ if (args.pa_flags & FLAG_DO_ANNOTATE && args.pa_inputpath == NULL) errx(EX_USAGE, "ERROR: option %s requires an input file", args.pa_plugin == PMCSTAT_PL_ANNOTATE ? "-m" : "-a"); /* -m option is not allowed combined with -g or -G. */ if (args.pa_flags & FLAG_DO_ANNOTATE && args.pa_flags & (FLAG_DO_GPROF | FLAG_DO_CALLGRAPHS)) errx(EX_USAGE, "ERROR: option -m and -g | -G are mutually exclusive"); if (args.pa_flags & FLAG_READ_LOGFILE) { errmsg = NULL; if (args.pa_flags & FLAG_HAS_COMMANDLINE) errmsg = "a command line specification"; else if (args.pa_flags & FLAG_HAS_TARGET) errmsg = "option -t"; else if (!STAILQ_EMPTY(&args.pa_events)) errmsg = "a PMC event specification"; if (errmsg) errx(EX_USAGE, "ERROR: option -R may not be used with %s.", errmsg); } else if (STAILQ_EMPTY(&args.pa_events)) /* All other uses require a PMC spec. */ pmcstat_show_usage(); /* check for -t pid without a process PMC spec */ if ((args.pa_required & FLAG_HAS_TARGET) && (args.pa_flags & FLAG_HAS_PROCESS_PMCS) == 0) errx(EX_USAGE, "ERROR: option -t requires a process mode PMC to be specified." ); /* check for process-mode options without a command or -t pid */ if ((args.pa_required & FLAG_HAS_PROCESS_PMCS) && (args.pa_flags & (FLAG_HAS_COMMANDLINE | FLAG_HAS_TARGET)) == 0) errx(EX_USAGE, "ERROR: options -d, -E, -p, -P, and -W require a command line or target process." ); /* check for -p | -P without a target process of some sort */ if ((args.pa_required & (FLAG_HAS_COMMANDLINE | FLAG_HAS_TARGET)) && (args.pa_flags & (FLAG_HAS_COMMANDLINE | FLAG_HAS_TARGET)) == 0) errx(EX_USAGE, "ERROR: options -P and -p require a target process or a command line." ); /* check for process-mode options without a process-mode PMC */ if ((args.pa_required & FLAG_HAS_PROCESS_PMCS) && (args.pa_flags & FLAG_HAS_PROCESS_PMCS) == 0) errx(EX_USAGE, "ERROR: options -d, -E, and -W require a process mode PMC to be specified." ); /* check for -c cpu with no system mode PMCs or logfile. */ if ((args.pa_required & FLAG_HAS_SYSTEM_PMCS) && (args.pa_flags & FLAG_HAS_SYSTEM_PMCS) == 0 && (args.pa_flags & FLAG_READ_LOGFILE) == 0) errx(EX_USAGE, "ERROR: option -c requires at least one system mode PMC to be specified." ); /* check for counting mode options without a counting PMC */ if ((args.pa_required & FLAG_HAS_COUNTING_PMCS) && (args.pa_flags & FLAG_HAS_COUNTING_PMCS) == 0) errx(EX_USAGE, "ERROR: options -C, -W and -o require at least one counting mode PMC to be specified." ); /* check for sampling mode options without a sampling PMC spec */ if ((args.pa_required & FLAG_HAS_SAMPLING_PMCS) && (args.pa_flags & FLAG_HAS_SAMPLING_PMCS) == 0) errx(EX_USAGE, "ERROR: options -N, -n and -O require at least one sampling mode PMC to be specified." ); /* check if -g/-G/-m/-T are being used correctly */ if ((args.pa_flags & FLAG_DO_ANALYSIS) && !(args.pa_flags & (FLAG_HAS_SAMPLING_PMCS|FLAG_READ_LOGFILE))) errx(EX_USAGE, "ERROR: options -g/-G/-m/-T require sampling PMCs or -R to be specified." ); /* check if -e was specified without -g */ if ((args.pa_flags & FLAG_DO_WIDE_GPROF_HC) && !(args.pa_flags & FLAG_DO_GPROF)) errx(EX_USAGE, "ERROR: option -e requires gprof mode to be specified." ); /* check if -O was spuriously specified */ if ((args.pa_flags & FLAG_HAS_OUTPUT_LOGFILE) && (args.pa_required & FLAG_HAS_OUTPUT_LOGFILE) == 0) errx(EX_USAGE, "ERROR: option -O is used only with options -E, -P, -S and -W." ); /* -k kernel path require -g/-G/-m/-T or -R */ if ((args.pa_flags & FLAG_HAS_KERNELPATH) && (args.pa_flags & FLAG_DO_ANALYSIS) == 0 && (args.pa_flags & FLAG_READ_LOGFILE) == 0) errx(EX_USAGE, "ERROR: option -k is only used with -g/-R/-m/-T."); /* -D only applies to gprof output mode (-g) */ if ((args.pa_flags & FLAG_HAS_SAMPLESDIR) && (args.pa_flags & FLAG_DO_GPROF) == 0) errx(EX_USAGE, "ERROR: option -D is only used with -g."); /* -M mapfile requires -g or -R */ if (args.pa_mapfilename != NULL && (args.pa_flags & FLAG_DO_GPROF) == 0 && (args.pa_flags & FLAG_READ_LOGFILE) == 0) errx(EX_USAGE, "ERROR: option -M is only used with -g/-R."); /* * Disallow textual output of sampling PMCs if counting PMCs * have also been asked for, mostly because the combined output * is difficult to make sense of. */ if ((args.pa_flags & FLAG_HAS_COUNTING_PMCS) && (args.pa_flags & FLAG_HAS_SAMPLING_PMCS) && ((args.pa_flags & FLAG_HAS_OUTPUT_LOGFILE) == 0)) errx(EX_USAGE, "ERROR: option -O is required if counting and sampling PMCs are specified together." ); /* * Check if 'kerneldir' refers to a file rather than a * directory. If so, use `dirname path` to determine the * kernel directory. */ (void) snprintf(buffer, sizeof(buffer), "%s%s", args.pa_fsroot, args.pa_kernel); if (stat(buffer, &sb) < 0) err(EX_OSERR, "ERROR: Cannot locate kernel \"%s\"", buffer); if (!S_ISREG(sb.st_mode) && !S_ISDIR(sb.st_mode)) errx(EX_USAGE, "ERROR: \"%s\": Unsupported file type.", buffer); if (!S_ISDIR(sb.st_mode)) { tmp = args.pa_kernel; args.pa_kernel = strdup(dirname(args.pa_kernel)); + if (args.pa_kernel == NULL) + errx(EX_SOFTWARE, "ERROR: Out of memory"); free(tmp); (void) snprintf(buffer, sizeof(buffer), "%s%s", args.pa_fsroot, args.pa_kernel); if (stat(buffer, &sb) < 0) err(EX_OSERR, "ERROR: Cannot stat \"%s\"", buffer); if (!S_ISDIR(sb.st_mode)) errx(EX_USAGE, "ERROR: \"%s\" is not a directory.", buffer); } /* * If we have a callgraph be created, select the outputfile. */ if (args.pa_flags & FLAG_DO_CALLGRAPHS) { if (strcmp(graphfilename, "-") == 0) args.pa_graphfile = args.pa_printfile; else { args.pa_graphfile = fopen(graphfilename, "w"); if (args.pa_graphfile == NULL) err(EX_OSERR, "ERROR: cannot open \"%s\" for writing", graphfilename); } } if (args.pa_flags & FLAG_DO_ANNOTATE) { args.pa_graphfile = fopen(graphfilename, "w"); if (args.pa_graphfile == NULL) err(EX_OSERR, "ERROR: cannot open \"%s\" for writing", graphfilename); } /* if we've been asked to process a log file, skip init */ if ((args.pa_flags & FLAG_READ_LOGFILE) == 0) { if (pmc_init() < 0) err(EX_UNAVAILABLE, "ERROR: Initialization of the pmc(3) library failed" ); if ((npmc = pmc_npmc(0)) < 0) /* assume all CPUs are identical */ err(EX_OSERR, "ERROR: Cannot determine the number of PMCs on CPU %d", 0); } /* Allocate a kqueue */ if ((pmcstat_kq = kqueue()) < 0) err(EX_OSERR, "ERROR: Cannot allocate kqueue"); /* Setup the logfile as the source. */ if (args.pa_flags & FLAG_READ_LOGFILE) { /* * Print the log in textual form if we haven't been * asked to generate profiling information. */ if ((args.pa_flags & FLAG_DO_ANALYSIS) == 0) args.pa_flags |= FLAG_DO_PRINT; pmcstat_initialize_logging(); rfd = pmcstat_open_log(args.pa_inputpath, PMCSTAT_OPEN_FOR_READ); if ((args.pa_logparser = pmclog_open(rfd)) == NULL) err(EX_OSERR, "ERROR: Cannot create parser"); if (fcntl(rfd, F_SETFL, O_NONBLOCK) < 0) err(EX_OSERR, "ERROR: fcntl(2) failed"); EV_SET(&kev, rfd, EVFILT_READ, EV_ADD, 0, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent"); } /* * Configure the specified log file or setup a default log * consumer via a pipe. */ if (args.pa_required & FLAG_HAS_OUTPUT_LOGFILE) { if (args.pa_outputpath) args.pa_logfd = pmcstat_open_log(args.pa_outputpath, PMCSTAT_OPEN_FOR_WRITE); else { /* * process the log on the fly by reading it in * through a pipe. */ if (pipe(pipefd) < 0) err(EX_OSERR, "ERROR: pipe(2) failed"); if (fcntl(pipefd[READPIPEFD], F_SETFL, O_NONBLOCK) < 0) err(EX_OSERR, "ERROR: fcntl(2) failed"); EV_SET(&kev, pipefd[READPIPEFD], EVFILT_READ, EV_ADD, 0, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent"); args.pa_logfd = pipefd[WRITEPIPEFD]; args.pa_flags |= FLAG_HAS_PIPE; if ((args.pa_flags & FLAG_DO_TOP) == 0) args.pa_flags |= FLAG_DO_PRINT; args.pa_logparser = pmclog_open(pipefd[READPIPEFD]); } if (pmc_configure_logfile(args.pa_logfd) < 0) err(EX_OSERR, "ERROR: Cannot configure log file"); } /* remember to check for driver errors if we are sampling or logging */ check_driver_stats = (args.pa_flags & FLAG_HAS_SAMPLING_PMCS) || (args.pa_flags & FLAG_HAS_OUTPUT_LOGFILE); /* if (args.pa_flags & FLAG_READ_LOGFILE) { * Allocate PMCs. */ STAILQ_FOREACH(ev, &args.pa_events, ev_next) { if (pmc_allocate(ev->ev_spec, ev->ev_mode, ev->ev_flags, ev->ev_cpu, &ev->ev_pmcid) < 0) err(EX_OSERR, "ERROR: Cannot allocate %s-mode pmc with specification \"%s\"", PMC_IS_SYSTEM_MODE(ev->ev_mode) ? "system" : "process", ev->ev_spec); if (PMC_IS_SAMPLING_MODE(ev->ev_mode) && pmc_set(ev->ev_pmcid, ev->ev_count) < 0) err(EX_OSERR, "ERROR: Cannot set sampling count for PMC \"%s\"", ev->ev_name); } /* compute printout widths */ STAILQ_FOREACH(ev, &args.pa_events, ev_next) { int counter_width; int display_width; int header_width; (void) pmc_width(ev->ev_pmcid, &counter_width); header_width = strlen(ev->ev_name) + 2; /* prefix '%c/' */ display_width = (int) floor(counter_width / 3.32193) + 1; if (PMC_IS_SYSTEM_MODE(ev->ev_mode)) header_width += 3; /* 2 digit CPU number + '/' */ if (header_width > display_width) { ev->ev_fieldskip = 0; ev->ev_fieldwidth = header_width; } else { ev->ev_fieldskip = display_width - header_width; ev->ev_fieldwidth = display_width; } } /* * If our output is being set to a terminal, register a handler * for window size changes. */ if (isatty(fileno(args.pa_printfile))) { if (ioctl(fileno(args.pa_printfile), TIOCGWINSZ, &ws) < 0) err(EX_OSERR, "ERROR: Cannot determine window size"); pmcstat_displayheight = ws.ws_row - 1; pmcstat_displaywidth = ws.ws_col - 1; EV_SET(&kev, SIGWINCH, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent for SIGWINCH"); args.pa_toptty = 1; } /* * Listen to key input in top mode. */ if (args.pa_flags & FLAG_DO_TOP) { EV_SET(&kev, fileno(stdin), EVFILT_READ, EV_ADD, 0, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent"); } EV_SET(&kev, SIGINT, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent for SIGINT"); EV_SET(&kev, SIGIO, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent for SIGIO"); /* * An exec() failure of a forked child is signalled by the * child sending the parent a SIGCHLD. We don't register an * actual signal handler for SIGCHLD, but instead use our * kqueue to pick up the signal. */ EV_SET(&kev, SIGCHLD, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent for SIGCHLD"); /* * Setup a timer if we have counting mode PMCs needing to be printed or * top mode plugin is active. */ if (((args.pa_flags & FLAG_HAS_COUNTING_PMCS) && (args.pa_required & FLAG_HAS_OUTPUT_LOGFILE) == 0) || (args.pa_flags & FLAG_DO_TOP)) { EV_SET(&kev, 0, EVFILT_TIMER, EV_ADD, 0, args.pa_interval * 1000, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent for timer"); } /* * Setup a duration timer if we have sampling mode PMCs and * a duration time is set */ if ((args.pa_flags & FLAG_HAS_SAMPLING_PMCS) && (args.pa_flags & FLAG_HAS_DURATION)) { EV_SET(&kev, 0, EVFILT_TIMER, EV_ADD, 0, args.pa_duration * 1000, NULL); if (kevent(pmcstat_kq, &kev, 1, NULL, 0, NULL) < 0) err(EX_OSERR, "ERROR: Cannot register kevent for " "time duration"); } /* attach PMCs to the target process, starting it if specified */ if (args.pa_flags & FLAG_HAS_COMMANDLINE) pmcstat_create_process(); if (check_driver_stats && pmc_get_driver_stats(&ds_start) < 0) err(EX_OSERR, "ERROR: Cannot retrieve driver statistics"); /* Attach process pmcs to the target process. */ if (args.pa_flags & (FLAG_HAS_TARGET | FLAG_HAS_COMMANDLINE)) { if (SLIST_EMPTY(&args.pa_targets)) errx(EX_DATAERR, "ERROR: No matching target processes."); if (args.pa_flags & FLAG_HAS_PROCESS_PMCS) pmcstat_attach_pmcs(); if (pmcstat_kvm) { kvm_close(pmcstat_kvm); pmcstat_kvm = NULL; } } /* start the pmcs */ pmcstat_start_pmcs(); /* start the (commandline) process if needed */ if (args.pa_flags & FLAG_HAS_COMMANDLINE) pmcstat_start_process(); /* initialize logging */ pmcstat_initialize_logging(); /* Handle SIGINT using the kqueue loop */ sa.sa_handler = SIG_IGN; sa.sa_flags = 0; (void) sigemptyset(&sa.sa_mask); if (sigaction(SIGINT, &sa, NULL) < 0) err(EX_OSERR, "ERROR: Cannot install signal handler"); /* * Setup the top mode display. */ if (args.pa_flags & FLAG_DO_TOP) { args.pa_flags &= ~FLAG_DO_PRINT; if (args.pa_toptty) { /* * Init ncurses. */ initscr(); if(has_colors() == TRUE) { args.pa_topcolor = 1; start_color(); use_default_colors(); pair_content(0, &cf, &cb); init_pair(1, COLOR_RED, cb); init_pair(2, COLOR_YELLOW, cb); init_pair(3, COLOR_GREEN, cb); } cbreak(); noecho(); nonl(); nodelay(stdscr, 1); intrflush(stdscr, FALSE); keypad(stdscr, TRUE); clear(); /* Get terminal width / height with ncurses. */ getmaxyx(stdscr, pmcstat_displayheight, pmcstat_displaywidth); pmcstat_displayheight--; pmcstat_displaywidth--; atexit(pmcstat_topexit); } } /* * loop till either the target process (if any) exits, or we * are killed by a SIGINT or we reached the time duration. */ runstate = PMCSTAT_RUNNING; do_print = do_read = 0; do { if ((c = kevent(pmcstat_kq, NULL, 0, &kev, 1, NULL)) <= 0) { if (errno != EINTR) err(EX_OSERR, "ERROR: kevent failed"); else continue; } if (kev.flags & EV_ERROR) errc(EX_OSERR, kev.data, "ERROR: kevent failed"); switch (kev.filter) { case EVFILT_PROC: /* target has exited */ runstate = pmcstat_close_log(); do_print = 1; break; case EVFILT_READ: /* log file data is present */ if (kev.ident == (unsigned)fileno(stdin) && (args.pa_flags & FLAG_DO_TOP)) { if (pmcstat_keypress_log()) runstate = pmcstat_close_log(); } else { do_read = 0; runstate = pmcstat_process_log(); } break; case EVFILT_SIGNAL: if (kev.ident == SIGCHLD) { /* * The child process sends us a * SIGCHLD if its exec() failed. We * wait for it to exit and then exit * ourselves. */ (void) wait(&c); runstate = PMCSTAT_FINISHED; } else if (kev.ident == SIGIO) { /* * We get a SIGIO if a PMC loses all * of its targets, or if logfile * writes encounter an error. */ runstate = pmcstat_close_log(); do_print = 1; /* print PMCs at exit */ } else if (kev.ident == SIGINT) { /* Kill the child process if we started it */ if (args.pa_flags & FLAG_HAS_COMMANDLINE) pmcstat_kill_process(); runstate = pmcstat_close_log(); } else if (kev.ident == SIGWINCH) { if (ioctl(fileno(args.pa_printfile), TIOCGWINSZ, &ws) < 0) err(EX_OSERR, "ERROR: Cannot determine window size"); pmcstat_displayheight = ws.ws_row - 1; pmcstat_displaywidth = ws.ws_col - 1; } else assert(0); break; case EVFILT_TIMER: /* time duration reached, exit */ if (args.pa_flags & FLAG_HAS_DURATION) { runstate = PMCSTAT_FINISHED; break; } /* print out counting PMCs */ if ((args.pa_flags & FLAG_DO_TOP) && pmc_flush_logfile() == 0) do_read = 1; do_print = 1; break; } if (do_print && !do_read) { if ((args.pa_required & FLAG_HAS_OUTPUT_LOGFILE) == 0) { pmcstat_print_pmcs(); if (runstate == PMCSTAT_FINISHED && /* final newline */ (args.pa_flags & FLAG_DO_PRINT) == 0) (void) fprintf(args.pa_printfile, "\n"); } if (args.pa_flags & FLAG_DO_TOP) pmcstat_display_log(); do_print = 0; } } while (runstate != PMCSTAT_FINISHED); if ((args.pa_flags & FLAG_DO_TOP) && args.pa_toptty) { pmcstat_topexit(); args.pa_toptty = 0; } /* flush any pending log entries */ if (args.pa_flags & (FLAG_HAS_OUTPUT_LOGFILE | FLAG_HAS_PIPE)) pmc_close_logfile(); pmcstat_cleanup(); - - free(args.pa_kernel); /* check if the driver lost any samples or events */ if (check_driver_stats) { if (pmc_get_driver_stats(&ds_end) < 0) err(EX_OSERR, "ERROR: Cannot retrieve driver statistics"); if (ds_start.pm_intr_bufferfull != ds_end.pm_intr_bufferfull && args.pa_verbosity > 0) warnx( "WARNING: sampling was paused at least %u time%s.\n" "Please consider tuning the \"kern.hwpmc.nsamples\" tunable.", ds_end.pm_intr_bufferfull - ds_start.pm_intr_bufferfull, ((ds_end.pm_intr_bufferfull - ds_start.pm_intr_bufferfull) != 1) ? "s" : "" ); if (ds_start.pm_buffer_requests_failed != ds_end.pm_buffer_requests_failed && args.pa_verbosity > 0) warnx( "WARNING: at least %u event%s were discarded while running.\n" "Please consider tuning the \"kern.hwpmc.nbuffers\" tunable.", ds_end.pm_buffer_requests_failed - ds_start.pm_buffer_requests_failed, ((ds_end.pm_buffer_requests_failed - ds_start.pm_buffer_requests_failed) != 1) ? "s" : "" ); } exit(EX_OK); } Index: user/alc/PQ_LAUNDRY/usr.sbin/portsnap/portsnap/portsnap.sh =================================================================== --- user/alc/PQ_LAUNDRY/usr.sbin/portsnap/portsnap/portsnap.sh (revision 306831) +++ user/alc/PQ_LAUNDRY/usr.sbin/portsnap/portsnap/portsnap.sh (revision 306832) @@ -1,1146 +1,1147 @@ #!/bin/sh #- # Copyright 2004-2005 Colin Percival # All rights reserved # # Redistribution and use in source and binary forms, with or without # modification, are permitted providing that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # $FreeBSD$ #### Usage function -- called from command-line handling code. # Usage instructions. Options not listed: # --debug -- don't filter output from utilities # --no-stats -- don't show progress statistics while fetching files usage() { cat < serverlist_tried # Check that host(1) exists (i.e., that the system wasn't built with the # WITHOUT_BIND set) and don't try to find a mirror if it doesn't exist. if ! which -s host; then : > serverlist_full return 1 fi echo -n "Looking up ${SERVERNAME} mirrors... " # Issue the SRV query and pull out the Priority, Weight, and Target fields. # BIND 9 prints "$name has SRV record ..." while BIND 8 prints # "$name server selection ..."; we allow either format. MLIST="_http._tcp.${SERVERNAME}" host -t srv "${MLIST}" | sed -nE "s/${MLIST} (has SRV record|server selection) //Ip" | cut -f 1,2,4 -d ' ' | sed -e 's/\.$//' | sort > serverlist_full # If no records, give up -- we'll just use the server name we were given. if [ `wc -l < serverlist_full` -eq 0 ]; then echo "none found." return 1 fi # Report how many mirrors we found. echo `wc -l < serverlist_full` "mirrors found." # Generate a random seed for use in picking mirrors. If HTTP_PROXY # is set, this will be used to generate the seed; otherwise, the seed # will be random. if [ -n "${HTTP_PROXY}${http_proxy}" ]; then RANDVALUE=`sha256 -qs "${HTTP_PROXY}${http_proxy}" | tr -d 'a-f' | cut -c 1-9` else RANDVALUE=`jot -r 1 0 999999999` fi } # Pick a mirror. Returns 1 if we have run out of mirrors to try. fetch_pick_server() { # Generate a list of not-yet-tried mirrors sort serverlist_tried | comm -23 serverlist_full - > serverlist # Have we run out of mirrors? if [ `wc -l < serverlist` -eq 0 ]; then echo "No mirrors remaining, giving up." return 1 fi # Find the highest priority level (lowest numeric value). SRV_PRIORITY=`cut -f 1 -d ' ' serverlist | sort -n | head -1` # Add up the weights of the response lines at that priority level. SRV_WSUM=0; while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_WSUM=$(($SRV_WSUM + $SRV_W)) ;; esac done < serverlist # If all the weights are 0, pretend that they are all 1 instead. if [ ${SRV_WSUM} -eq 0 ]; then SRV_WSUM=`grep -E "^${SRV_PRIORITY} " serverlist | wc -l` SRV_W_ADD=1 else SRV_W_ADD=0 fi # Pick a value between 0 and the sum of the weights - 1 SRV_RND=`expr ${RANDVALUE} % ${SRV_WSUM}` # Read through the list of mirrors and set SERVERNAME. Write the line # corresponding to the mirror we selected into serverlist_tried so that # we won't try it again. while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_W=$(($SRV_W + $SRV_W_ADD)) if [ $SRV_RND -lt $SRV_W ]; then SERVERNAME=`echo $X | cut -f 3 -d ' '` echo "$X" >> serverlist_tried break else SRV_RND=$(($SRV_RND - $SRV_W)) fi ;; esac done < serverlist } # Check that we have a public key with an appropriate hash, or # fetch the key if it doesn't exist. Returns 1 if the key has # not yet been fetched. fetch_key() { if [ -r pub.ssl ] && [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then return 0 fi echo -n "Fetching public key from ${SERVERNAME}... " rm -f pub.ssl fetch ${QUIETFLAG} http://${SERVERNAME}/pub.ssl \ 2>${QUIETREDIR} || true if ! [ -r pub.ssl ]; then echo "failed." return 1 fi if ! [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then echo "key has incorrect hash." rm -f pub.ssl return 1 fi echo "done." } # Fetch a snapshot tag fetch_tag() { rm -f snapshot.ssl tag.new echo ${NDEBUG} "Fetching snapshot tag from ${SERVERNAME}... " fetch ${QUIETFLAG} http://${SERVERNAME}/$1.ssl \ 2>${QUIETREDIR} || true if ! [ -r $1.ssl ]; then echo "failed." return 1 fi openssl rsautl -pubin -inkey pub.ssl -verify \ < $1.ssl > tag.new 2>${QUIETREDIR} || true rm $1.ssl if ! [ `wc -l < tag.new` = 1 ] || ! grep -qE "^portsnap\|[0-9]{10}\|[0-9a-f]{64}" tag.new; then echo "invalid snapshot tag." return 1 fi echo "done." SNAPSHOTDATE=`cut -f 2 -d '|' < tag.new` SNAPSHOTHASH=`cut -f 3 -d '|' < tag.new` } # Sanity-check the date on a snapshot tag fetch_snapshot_tagsanity() { if [ `date "+%s"` -gt `expr ${SNAPSHOTDATE} + 31536000` ]; then echo "Snapshot appears to be more than a year old!" echo "(Is the system clock correct?)" echo "Cowardly refusing to proceed any further." return 1 fi if [ `date "+%s"` -lt `expr ${SNAPSHOTDATE} - 86400` ]; then echo -n "Snapshot appears to have been created more than " echo "one day into the future!" echo "(Is the system clock correct?)" echo "Cowardly refusing to proceed any further." return 1 fi } # Sanity-check the date on a snapshot update tag fetch_update_tagsanity() { fetch_snapshot_tagsanity || return 1 if [ ${OLDSNAPSHOTDATE} -gt ${SNAPSHOTDATE} ]; then echo -n "Latest snapshot on server is " echo "older than what we already have!" echo -n "Cowardly refusing to downgrade from " date -r ${OLDSNAPSHOTDATE} echo "to `date -r ${SNAPSHOTDATE}`." return 1 fi } # Compare old and new tags; return 1 if update is unnecessary fetch_update_neededp() { if [ ${OLDSNAPSHOTDATE} -eq ${SNAPSHOTDATE} ]; then echo -n "Latest snapshot on server matches " echo "what we already have." echo "No updates needed." rm tag.new return 1 fi if [ ${OLDSNAPSHOTHASH} = ${SNAPSHOTHASH} ]; then echo -n "Ports tree hasn't changed since " echo "last snapshot." echo "No updates needed." rm tag.new return 1 fi return 0 } # Fetch snapshot metadata file fetch_metadata() { rm -f ${SNAPSHOTHASH} tINDEX.new echo ${NDEBUG} "Fetching snapshot metadata... " fetch ${QUIETFLAG} http://${SERVERNAME}/t/${SNAPSHOTHASH} \ 2>${QUIETREDIR} || return if [ "`${SHA256} -q ${SNAPSHOTHASH}`" != ${SNAPSHOTHASH} ]; then echo "snapshot metadata corrupt." return 1 fi mv ${SNAPSHOTHASH} tINDEX.new echo "done." } # Warn user about bogus metadata fetch_metadata_freakout() { echo echo "Portsnap metadata is correctly signed, but contains" echo "at least one line which appears bogus." echo "Cowardly refusing to proceed any further." } # Sanity-check a snapshot metadata file fetch_metadata_sanity() { if grep -qvE "^[0-9A-Z.]+\|[0-9a-f]{64}$" tINDEX.new; then fetch_metadata_freakout return 1 fi if [ `look INDEX tINDEX.new | wc -l` != 1 ]; then echo echo "Portsnap metadata appears bogus." echo "Cowardly refusing to proceed any further." return 1 fi } # Take a list of ${oldhash}|${newhash} and output a list of needed patches fetch_make_patchlist() { local IFS='|' echo "" 1>${QUIETREDIR} grep -vE "^([0-9a-f]{64})\|\1$" | while read X Y; do printf "Processing: $X $Y ...\r" 1>${QUIETREDIR} if [ -f "files/${Y}.gz" -o ! -f "files/${X}.gz" ]; then continue; fi echo "${X}|${Y}" done echo "" 1>${QUIETREDIR} } # Print user-friendly progress statistics fetch_progress() { LNC=0 while read x; do LNC=$(($LNC + 1)) if [ $(($LNC % 10)) = 0 ]; then echo -n $LNC elif [ $(($LNC % 2)) = 0 ]; then echo -n . fi done echo -n " " } pct_fmt() { printf " \r" printf "($1/$2) %02.2f%% " `echo "scale=4;$LNC / $TOTAL * 100"|bc` } fetch_progress_percent() { TOTAL=$1 LNC=0 pct_fmt $LNC $TOTAL while read x; do LNC=$(($LNC + 1)) if [ $(($LNC % 100)) = 0 ]; then pct_fmt $LNC $TOTAL elif [ $(($LNC % 10)) = 0 ]; then echo -n . fi done pct_fmt $LNC $TOTAL echo " done. " } # Sanity-check an index file fetch_index_sanity() { if grep -qvE "^[-_+./@0-9A-Za-z]+\|[0-9a-f]{64}$" INDEX.new || fgrep -q "./" INDEX.new; then fetch_metadata_freakout return 1 fi } # Verify a list of files fetch_snapshot_verify() { while read F; do if [ "`gunzip -c < snap/${F}.gz | ${SHA256} -q`" != ${F} ]; then echo "snapshot corrupt." return 1 fi done return 0 } # Fetch a snapshot tarball, extract, and verify. fetch_snapshot() { while ! fetch_tag snapshot; do fetch_pick_server || return 1 done fetch_snapshot_tagsanity || return 1 fetch_metadata || return 1 fetch_metadata_sanity || return 1 rm -rf snap/ # Don't ask fetch(1) to be quiet -- downloading a snapshot of ~ 35MB will # probably take a while, so the progrees reports that fetch(1) generates # will be useful for keeping the users' attention from drifting. echo "Fetching snapshot generated at `date -r ${SNAPSHOTDATE}`:" fetch -r http://${SERVERNAME}/s/${SNAPSHOTHASH}.tgz || return 1 echo -n "Extracting snapshot... " tar -xz --numeric-owner -f ${SNAPSHOTHASH}.tgz snap/ || return 1 rm ${SNAPSHOTHASH}.tgz echo "done." echo -n "Verifying snapshot integrity... " # Verify the metadata files cut -f 2 -d '|' tINDEX.new | fetch_snapshot_verify || return 1 # Extract the index rm -f INDEX.new gunzip -c < snap/`look INDEX tINDEX.new | cut -f 2 -d '|'`.gz > INDEX.new fetch_index_sanity || return 1 # Verify the snapshot contents cut -f 2 -d '|' INDEX.new | fetch_snapshot_verify || return 1 - cut -f 2 -d '|' tINDEX.new INDEX.new | sort -u > files.expected - find snap -mindepth 1 | sed -E 's^snap/(.*)\.gz^\1^' | sort > files.snap + cut -f 2 -d '|' tINDEX.new INDEX.new | sort -u | + lam -s 'snap/' - -s '.gz' > files.expected + find snap -mindepth 1 | sort > files.snap if ! cmp -s files.expected files.snap; then echo "unexpected files in snapshot." return 1 fi rm files.expected files.snap echo "done." # Move files into their proper locations rm -f tag INDEX tINDEX rm -rf files mv tag.new tag mv tINDEX.new tINDEX mv INDEX.new INDEX mv snap/ files/ return 0 } # Update a compressed snapshot fetch_update() { rm -f patchlist diff OLD NEW filelist INDEX.new OLDSNAPSHOTDATE=`cut -f 2 -d '|' < tag` OLDSNAPSHOTHASH=`cut -f 3 -d '|' < tag` while ! fetch_tag latest; do fetch_pick_server || return 1 done fetch_update_tagsanity || return 1 fetch_update_neededp || return 0 fetch_metadata || return 1 fetch_metadata_sanity || return 1 echo -n "Updating from `date -r ${OLDSNAPSHOTDATE}` " echo "to `date -r ${SNAPSHOTDATE}`." # Generate a list of wanted metadata patches join -t '|' -o 1.2,2.2 tINDEX tINDEX.new | fetch_make_patchlist > patchlist # Attempt to fetch metadata patches echo -n "Fetching `wc -l < patchlist | tr -d ' '` " echo ${NDEBUG} "metadata patches.${DDSTATS}" tr '|' '-' < patchlist | lam -s "tp/" - -s ".gz" | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress echo "done." # Attempt to apply metadata patches echo -n "Applying metadata patches... " local oldifs="$IFS" IFS='|' while read X Y; do if [ ! -f "${X}-${Y}.gz" ]; then continue; fi gunzip -c < ${X}-${Y}.gz > diff gunzip -c < files/${X}.gz > OLD cut -c 2- diff | join -t '|' -v 2 - OLD > ptmp grep '^\+' diff | cut -c 2- | sort -k 1,1 -t '|' -m - ptmp > NEW if [ `${SHA256} -q NEW` = ${Y} ]; then mv NEW files/${Y} gzip -n files/${Y} fi rm -f diff OLD NEW ${X}-${Y}.gz ptmp done < patchlist 2>${QUIETREDIR} IFS="$oldifs" echo "done." # Update metadata without patches join -t '|' -v 2 tINDEX tINDEX.new | cut -f 2 -d '|' /dev/stdin patchlist | while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done > filelist echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "metadata files... " lam -s "f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} while read Y; do echo -n "Verifying ${Y}... " 1>${QUIETREDIR} if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "metadata is corrupt." return 1 fi echo "ok." 1>${QUIETREDIR} done < filelist echo "done." # Extract the index echo -n "Extracting index... " 1>${QUIETREDIR} gunzip -c < files/`look INDEX tINDEX.new | cut -f 2 -d '|'`.gz > INDEX.new fetch_index_sanity || return 1 # If we have decided to refuse certain updates, construct a hybrid index which # is equal to the old index for parts of the tree which we don't want to # update, and equal to the new index for parts of the tree which gets updates. # This means that we should always have a "complete snapshot" of the ports # tree -- with the caveat that it isn't actually a snapshot. if [ ! -z "${REFUSE}" ]; then echo "Refusing to download updates for ${REFUSE}" \ >${QUIETREDIR} grep -Ev "${REFUSE}" INDEX.new > INDEX.tmp grep -E "${REFUSE}" INDEX | sort -m -k 1,1 -t '|' - INDEX.tmp > INDEX.new rm -f INDEX.tmp fi # Generate a list of wanted ports patches echo -n "Generating list of wanted patches..." 1>${QUIETREDIR} join -t '|' -o 1.2,2.2 INDEX INDEX.new | fetch_make_patchlist > patchlist echo " done." 1>${QUIETREDIR} # Attempt to fetch ports patches patchcnt=`wc -l < patchlist | tr -d ' '` echo -n "Fetching $patchcnt " echo ${NDEBUG} "patches.${DDSTATS}" echo " " tr '|' '-' < patchlist | lam -s "bp/" - | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress_percent $patchcnt echo "done." # Attempt to apply ports patches PATCHCNT=`wc -l patchlist` echo "Applying patches... " local oldifs="$IFS" IFS='|' I=0 while read X Y; do I=$(($I + 1)) F="${X}-${Y}" if [ ! -f "${F}" ]; then printf " Skipping ${F} (${I} of ${PATCHCNT}).\r" continue; fi echo " Processing ${F}..." 1>${QUIETREDIR} gunzip -c < files/${X}.gz > OLD ${BSPATCH} OLD NEW ${X}-${Y} if [ `${SHA256} -q NEW` = ${Y} ]; then mv NEW files/${Y} gzip -n files/${Y} fi rm -f diff OLD NEW ${X}-${Y} done < patchlist 2>${QUIETREDIR} IFS="$oldifs" echo "done." # Update ports without patches join -t '|' -v 2 INDEX INDEX.new | cut -f 2 -d '|' /dev/stdin patchlist | while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done > filelist echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "new ports or files... " lam -s "f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} I=0 while read Y; do I=$(($I + 1)) printf " Processing ${Y} (${I} of ${PATCHCNT}).\r" 1>${QUIETREDIR} if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "snapshot is corrupt." return 1 fi done < filelist echo "done." # Remove files which are no longer needed cut -f 2 -d '|' tINDEX INDEX | sort -u > oldfiles cut -f 2 -d '|' tINDEX.new INDEX.new | sort -u | comm -13 - oldfiles | lam -s "files/" - -s ".gz" | xargs rm -f rm patchlist filelist oldfiles # We're done! mv INDEX.new INDEX mv tINDEX.new tINDEX mv tag.new tag return 0 } # Do the actual work involved in "fetch" / "cron". fetch_run() { fetch_pick_server_init && fetch_pick_server while ! fetch_key; do fetch_pick_server || return 1 done if ! [ -d files -a -r tag -a -r INDEX -a -r tINDEX ]; then fetch_snapshot || return 1 fi fetch_update || return 1 } # Build a ports INDEX file extract_make_index() { if ! look $1 ${WORKDIR}/tINDEX > /dev/null; then echo -n "$1 not provided by portsnap server; " echo "$2 not being generated." else gunzip -c < "${WORKDIR}/files/`look $1 ${WORKDIR}/tINDEX | cut -f 2 -d '|'`.gz" | cat - ${LOCALDESC} | ${MKINDEX} /dev/stdin > ${PORTSDIR}/$2 fi } # Create INDEX, INDEX-5, INDEX-6 extract_indices() { echo -n "Building new INDEX files... " for PAIR in ${INDEXPAIRS}; do INDEXFILE=`echo ${PAIR} | cut -f 1 -d '|'` DESCRIBEFILE=`echo ${PAIR} | cut -f 2 -d '|'` extract_make_index ${DESCRIBEFILE} ${INDEXFILE} || return 1 done echo "done." } # Create .portsnap.INDEX; if we are REFUSEing to touch certain directories, # merge the values from any exiting .portsnap.INDEX file. extract_metadata() { if [ -z "${REFUSE}" ]; then sort ${WORKDIR}/INDEX > ${PORTSDIR}/.portsnap.INDEX elif [ -f ${PORTSDIR}/.portsnap.INDEX ]; then grep -E "${REFUSE}" ${PORTSDIR}/.portsnap.INDEX \ > ${PORTSDIR}/.portsnap.INDEX.tmp grep -vE "${REFUSE}" ${WORKDIR}/INDEX | sort | sort -m - ${PORTSDIR}/.portsnap.INDEX.tmp \ > ${PORTSDIR}/.portsnap.INDEX rm -f ${PORTSDIR}/.portsnap.INDEX.tmp else grep -vE "${REFUSE}" ${WORKDIR}/INDEX | sort \ > ${PORTSDIR}/.portsnap.INDEX fi } # Do the actual work involved in "extract" extract_run() { local oldifs="$IFS" IFS='|' mkdir -p ${PORTSDIR} || return 1 if ! if ! [ -z "${EXTRACTPATH}" ]; then grep "^${EXTRACTPATH}" ${WORKDIR}/INDEX elif ! [ -z "${REFUSE}" ]; then grep -vE "${REFUSE}" ${WORKDIR}/INDEX else cat ${WORKDIR}/INDEX fi | while read FILE HASH; do echo ${PORTSDIR}/${FILE} if ! [ -s "${WORKDIR}/files/${HASH}.gz" ]; then echo "files/${HASH}.gz not found -- snapshot corrupt." return 1 fi case ${FILE} in */) rm -rf ${PORTSDIR}/${FILE%/} mkdir -p ${PORTSDIR}/${FILE} tar -xz --numeric-owner -f ${WORKDIR}/files/${HASH}.gz \ -C ${PORTSDIR}/${FILE} ;; *) rm -f ${PORTSDIR}/${FILE} tar -xz --numeric-owner -f ${WORKDIR}/files/${HASH}.gz \ -C ${PORTSDIR} ${FILE} ;; esac done; then return 1 fi if [ ! -z "${EXTRACTPATH}" ]; then return 0; fi IFS="$oldifs" extract_metadata extract_indices } update_run_extract() { local IFS='|' # Install new files echo "Extracting new files:" if ! if ! [ -z "${REFUSE}" ]; then grep -vE "${REFUSE}" ${WORKDIR}/INDEX | sort else sort ${WORKDIR}/INDEX fi | comm -13 ${PORTSDIR}/.portsnap.INDEX - | while read FILE HASH; do echo ${PORTSDIR}/${FILE} if ! [ -s "${WORKDIR}/files/${HASH}.gz" ]; then echo "files/${HASH}.gz not found -- snapshot corrupt." return 1 fi case ${FILE} in */) mkdir -p ${PORTSDIR}/${FILE} tar -xz --numeric-owner -f ${WORKDIR}/files/${HASH}.gz \ -C ${PORTSDIR}/${FILE} ;; *) tar -xz --numeric-owner -f ${WORKDIR}/files/${HASH}.gz \ -C ${PORTSDIR} ${FILE} ;; esac done; then return 1 fi } # Do the actual work involved in "update" update_run() { if ! [ -z "${INDEXONLY}" ]; then extract_indices >/dev/null || return 1 return 0 fi if sort ${WORKDIR}/INDEX | cmp -s ${PORTSDIR}/.portsnap.INDEX -; then echo "Ports tree is already up to date." return 0 fi # If we are REFUSEing to touch certain directories, don't remove files # from those directories (even if they are out of date) echo -n "Removing old files and directories... " if ! [ -z "${REFUSE}" ]; then sort ${WORKDIR}/INDEX | comm -23 ${PORTSDIR}/.portsnap.INDEX - | cut -f 1 -d '|' | grep -vE "${REFUSE}" | lam -s "${PORTSDIR}/" - | sed -e 's|/$||' | xargs rm -rf else sort ${WORKDIR}/INDEX | comm -23 ${PORTSDIR}/.portsnap.INDEX - | cut -f 1 -d '|' | lam -s "${PORTSDIR}/" - | sed -e 's|/$||' | xargs rm -rf fi echo "done." update_run_extract || return 1 extract_metadata extract_indices } #### Main functions -- call parameter-handling and core functions # Using the command line, configuration file, and defaults, # set all the parameters which are needed later. get_params() { init_params parse_cmdline $@ sanity_conffile default_conffile parse_conffile default_params } # Fetch command. Make sure that we're being called # interactively, then run fetch_check_params and fetch_run cmd_fetch() { if [ "${INTERACTIVE}" != "YES" ]; then echo -n "`basename $0` fetch should not " echo "be run non-interactively." echo "Run `basename $0` cron instead" exit 1 fi fetch_check_params fetch_run || exit 1 } # Cron command. Make sure the parameters are sensible; wait # rand(3600) seconds; then fetch updates. While fetching updates, # send output to a temporary file; only print that file if the # fetching failed. cmd_cron() { fetch_check_params sleep `jot -r 1 0 3600` TMPFILE=`mktemp /tmp/portsnap.XXXXXX` || exit 1 if ! fetch_run >> ${TMPFILE}; then cat ${TMPFILE} rm ${TMPFILE} exit 1 fi rm ${TMPFILE} } # Extract command. Make sure the parameters are sensible, # then extract the ports tree (or part thereof). cmd_extract() { extract_check_params extract_run || exit 1 } # Update command. Make sure the parameters are sensible, # then update the ports tree. cmd_update() { update_check_params update_run || exit 1 } # Auto command. Run 'fetch' or 'cron' depending on # whether stdin is a terminal; then run 'update' or # 'extract' depending on whether ${PORTSDIR} exists. cmd_auto() { if [ "${INTERACTIVE}" = "YES" ]; then cmd_fetch else cmd_cron fi if [ -r ${PORTSDIR}/.portsnap.INDEX ]; then cmd_update else cmd_extract fi } #### Entry point # Make sure we find utilities from the base system export PATH=/sbin:/bin:/usr/sbin:/usr/bin:${PATH} # Set LC_ALL in order to avoid problems with character ranges like [A-Z]. export LC_ALL=C get_params $@ for COMMAND in ${COMMANDS}; do cmd_${COMMAND} done Index: user/alc/PQ_LAUNDRY =================================================================== --- user/alc/PQ_LAUNDRY (revision 306831) +++ user/alc/PQ_LAUNDRY (revision 306832) Property changes on: user/alc/PQ_LAUNDRY ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r306777-306831